Example 1
    def _build_graph(self, layer, previous_state):
        with layer_scope(self):

            if previous_state is None:
                input_batch = tf.shape(layer.tensor)[0]
                zero_state = tf.zeros([input_batch, self.n_units])
                self.previous_state = tx.TensorLayer(zero_state, self.n_units)

            if self.share_state_with is None:
                # determines the weight of the previous state
                # we could add the bias at the end but this way we just define a single bias for the r unit
                self.r_current_w = tx.Linear(layer,
                                             self.n_units,
                                             bias=True,
                                             weight_init=self.init,
                                             name="r_current_w")
                self.r_recurrent_w = tx.Linear(self.previous_state,
                                               self.n_units,
                                               bias=False,
                                               weight_init=self.recurrent_init,
                                               name="r_current_w")

                self.u_current_w = tx.Linear(layer,
                                             self.n_units,
                                             bias=True,
                                             weight_init=self.init,
                                             name="u_current_w")
                self.u_recurrent_w = tx.Linear(self.previous_state,
                                               self.n_units,
                                               bias=False,
                                               weight_init=self.recurrent_init,
                                               name="u_current_w")

                self.current_w = tx.Linear(layer,
                                           self.n_units,
                                           bias=True,
                                           weight_init=self.init,
                                           name="current_w")
                self.recurrent_w = tx.Linear(self.previous_state,
                                             self.n_units,
                                             bias=False,
                                             weight_init=self.recurrent_init,
                                             name="recurrent_w")

                # kernel_gate = tx.Activation()

                kernel_act = tx.Activation(self.current_w, self.activation)
                self.kernel = tx.Compose(self.current_w, kernel_act)
                self.recurrent_kernel = self.recurrent_w

            else:
                self.kernel = self.share_state_with.kernel.reuse_with(layer)
                self.recurrent_kernel = self.share_state_with.recurrent_kernel.reuse_with(
                    self.previous_state)

            r_state = tx.Add(self.r_current_w, self.r_recurrent_w)
            r_state = tx.Bias(r_state)
            r_gate = tx.Activation(r_state, fn=tx.sigmoid, name="r_gate")

            # """Gated recurrent unit (GRU) with nunits cells."""
            return self.kernel.tensor + self.recurrent_kernel.tensor
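
The example above builds the r and u gates but never combines them into the full update that the GRU comment refers to. For reference, a minimal sketch of the standard GRU step in plain TensorFlow; all weight and bias names here are illustrative placeholders, not part of the library code above.

import tensorflow as tf

def gru_step(x, h, w_r, u_r, b_r, w_u, u_u, b_u, w_c, u_c, b_c):
    # reset and update gates
    r = tf.sigmoid(tf.matmul(x, w_r) + tf.matmul(h, u_r) + b_r)
    u = tf.sigmoid(tf.matmul(x, w_u) + tf.matmul(h, u_u) + b_u)
    # candidate state computed from the reset-gated previous state
    c = tf.tanh(tf.matmul(x, w_c) + tf.matmul(r * h, u_c) + b_c)
    # new state interpolates between the previous state and the candidate
    return u * h + (1.0 - u) * c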
Example 2
def test_loss_model_dependencies():
    inputs = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="y_", constant=False)
    y = tx.Linear(inputs, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax, name="out1")
    out2 = tx.Activation(y, tf.nn.softmax, name="out2")

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    logging.basicConfig(level=logging.DEBUG)

    model = tx.Model(run_inputs=inputs,
                     run_outputs=[out1, out2],
                     train_inputs=[inputs, labels],
                     train_outputs=[out2, out1],
                     train_loss=loss(out1, labels))

    lr = tx.Param(0.5)
    opt = model.set_optimizer(tf.optimizers.SGD, lr=lr)
    assert isinstance(opt, tf.optimizers.Optimizer)

    it = model.train_graph.dependency_iter()
    layers = list(it)
    assert layers[0] is inputs
    assert layers[1] is labels
    assert len(layers) == 6
Example 3
def test_model_run():
    data1 = tf.constant([[1., 1.]])

    x = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="y_", constant=False)
    y = tx.Linear(x, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax)
    out2 = tx.Activation(y, tf.nn.softmax)

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    model = tx.Model(run_inputs=x,
                     run_outputs=[out1, out2],
                     train_inputs=[x, labels],
                     train_outputs=out1,
                     train_loss=loss(out1, labels))

    model.set_optimizer(tf.optimizers.SGD, lr=0.5)

    result1 = model.run({x: data1})
    result2 = model.run([data1])

    assert tx.tensor_equal(result1[0], result2[0])
    assert tx.tensor_equal(result1[1], result2[1])

    result3 = model.run({x: data1}, compiled_graph=True)
    assert tx.tensor_equal(result3[0], result2[0])
    assert tx.tensor_equal(result3[1], result2[1])
Example 4
def test_set_optimizer():
    x = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="labels", constant=False)
    y = tx.Linear(x, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax)
    out2 = tx.Activation(y, tf.nn.softmax)

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    model = tx.Model(run_inputs=x,
                     run_outputs=[out1, out2],
                     train_inputs=[x, labels],
                     train_outputs=[out2, out1],
                     train_loss=loss(out1, labels))

    lr = tx.Param(0.5)
    opt = model.set_optimizer(tf.optimizers.SGD,
                              learning_rate=lr,
                              clipnorm=0.1)

    assert isinstance(opt, tf.optimizers.Optimizer)

    assert model.optimizer.get_config()["learning_rate"] == 0.5

    data1 = [[1., 1.], [1., 1.]]
    data2 = tf.constant([[0., 1.], [0., 1.]])
    params = model.optimizer_params[model.optimizer]
    data_dict, params_dict = tx.Model.parse_input(
        {
            x: data1,
            "learning_rate": 0.2
        }, model.run_graph.in_nodes, params)
    assert len(data_dict) == 1
    assert len(params_dict) == 1
    assert model.optimizer_params[opt]["learning_rate"] is lr

    result1 = model.train_step({x: data1, labels: data2})
    result2 = model.train_step([data1, data2])

    assert len(result1) == 3
    assert len(result2) == 3
    assert tf.reduce_all(tf.less(result2[-1], result1[-1]))

    result1 = model.run({x: np.array(data1, dtype=np.float32)})
    result2 = model.run([data1])
    result3 = model.run(np.array(data1, np.float32))

    x.value = data1
    o2 = out2()
    o1 = out1()

    result4 = (o2, o1)

    for i in range(2):
        assert tx.tensor_equal(result1[i], result2[i])
        assert tx.tensor_equal(result1[i], result3[i])
        assert tx.tensor_equal(result1[i], result4[i])
Example 5
def test_override_out_nodes():
    x = tx.Input(n_units=2, name="x", constant=False)
    y = tx.Linear(x, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax, name="out1")
    out2 = tx.Activation(out1, tf.nn.softmax, name="out2")

    graph = Graph.build(inputs=x, outputs=[out1, out2])
    assert out1 in graph.out_nodes
    assert out2 in graph.out_nodes

    graph = Graph.build(inputs=x, outputs=out1)
    assert out1 in graph.out_nodes
    assert out2 not in graph.out_nodes
Example 6
    def _build_graph(self, layer, previous_state):
        with layer_scope(self):

            if previous_state is None:
                input_batch = tf.shape(layer.tensor)[0]
                zero_state = tf.zeros([input_batch, self.n_units])
                self.previous_state = tx.TensorLayer(zero_state, self.n_units)

            if self.share_state_with is None:
                kernel_linear = tx.Linear(layer,
                                          self.n_units,
                                          bias=True,
                                          weight_init=self.init,
                                          name="linear_kernel")
                kernel_act = tx.Activation(kernel_linear, self.activation)
                self.kernel = tx.Compose([kernel_linear, kernel_act])

                self.recurrent_kernel = tx.Linear(
                    self.previous_state,
                    self.n_units,
                    bias=False,
                    weight_init=self.recurrent_init,
                    name="recurrent_kernel")
            else:
                self.kernel = self.share_state_with.kernel.reuse_with(layer)
                self.recurrent_kernel = self.share_state_with.recurrent_kernel.reuse_with(
                    self.previous_state)

            # TODO this might be wrong, I might need to couple the activation: act(kernel + recurrent + bias)
            # TODO it is wrong https://github.com/tensorflow/tensorflow/blob/r1.8/tensorflow/python/ops/rnn_cell_impl.py
            # """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
            return self.kernel.tensor + self.recurrent_kernel.tensor
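
The TODO above is correct that the plain sum is not the complete cell. For reference, a minimal plain-TensorFlow sketch of the basic RNN step quoted in the comment, act(W * input + U * state + B); the argument names are illustrative placeholders.

import tensorflow as tf

def rnn_step(x, state, w, u, b, act=tf.tanh):
    # output = new_state = act(W * input + U * state + B)
    return act(tf.matmul(x, w) + tf.matmul(state, u) + b)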
Example 7
def test_reuse_dropout():
    x1 = tx.Constant(np.ones(shape=[2, 4]), dtype=tf.float32)
    x2 = tx.Activation(x1)
    drop1 = tx.Dropout(x2, probability=0.5, locked=True)

    assert len(drop1.inputs) == 2
    assert drop1.inputs[0] is x2
    assert drop1.inputs[-1] is drop1.layer_state.mask

    # shared state overrides mask?
    _, mask = tx.dropout(x2, return_mask=True)
    drop2 = drop1.reuse_with(x2, mask)

    assert len(drop2.inputs) == 2
    assert drop2.inputs[0] is x2
    assert drop2.inputs[-1] is drop2.layer_state.mask

    assert not tx.tensor_equal(drop1(), drop2())

    graph = tx.Graph.build(inputs=None, outputs=[drop1, drop2])

    out1, out2 = graph()
    assert tx.tensor_equal(out1, out2)

    drop1 = tx.Dropout(x2, probability=0.5)
    drop2 = drop1.reuse_with(x1)

    graph.eval(drop1, drop2)
Example 8
def test_to_sparse():
    inputs = tx.Input(init_value=tf.ones([2, 100]))
    linear = tx.Linear(inputs, n_units=100)
    relu = tx.Activation(linear, tx.relu)
    sparse = tx.ToSparse(relu)

    assert tx.shape_equal(sparse.shape, linear.shape)
    assert tx.shape_equal(sparse.shape, relu.shape)
Example 9
def test_build_graph():
    x1 = tx.Input(n_units=1000, constant=False, dtype=tf.float32)
    x2 = tx.Input(init_value=tf.ones([1, 3]), dtype=tf.float32, constant=True)

    y10 = tx.Linear(x1, n_units=3)
    y11 = tx.Activation(y10)
    y1 = tx.Module(x1, y11)
    y2 = tx.Add(y1, x2)
    output = y2

    graph = Graph.build(inputs=None, outputs=[y1, y2])
    # module condenses 2 nodes so it's 4 and not 6
    assert len(graph.nodes) == 4

    @tf.function
    def simple_graph(in0):
        x1.value = in0
        return y2()

    simple_graph_2 = Graph.build(inputs=[x1, x2], outputs=y2)
    simple_graph_2 = tf.function(simple_graph_2)
    g = Graph.build(inputs=[x1, x2], outputs=y2)
    y2fn = y2.as_function()
    data = tf.ones([256, 1000])
    x1.value = data

    compiled_fn = g.as_function(ord_inputs=x1, ord_outputs=output)

    assert tx.tensor_equal(compiled_fn(data), y2fn())
    assert tx.tensor_equal(compiled_fn(data), simple_graph_2()[0])

    from timeit import timeit

    def update_run():
        x1.value = tf.random.uniform([256, 1000])
        return y2fn()

    n = 1000
    t_update_run = timeit(update_run, number=n)
    t_generated = timeit(lambda: compiled_fn(tf.random.uniform([256, 1000])),
                         number=n)
    t_compile_value_set = timeit(
        lambda: simple_graph(tf.random.uniform([256, 1000])), number=n)
    t_graph_call_tf = timeit(
        lambda: simple_graph_2(tf.random.uniform([256, 1000])), number=n)

    assert t_generated < t_update_run
    assert t_generated < t_compile_value_set
    assert t_generated < t_graph_call_tf
    assert t_update_run > t_compile_value_set

    o1 = compiled_fn(tf.random.uniform([256, 1000]))
    o2 = compiled_fn(tf.random.uniform([256, 1000]))
    assert not tx.tensor_equal(o1, o2)
Example 10
def test_model_train():
    x = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="labels", constant=False)
    y = tx.Linear(x, 2, name="y1", add_bias=False)
    out1 = tx.Activation(y, tf.nn.softmax)
    out2 = tx.Activation(y, tf.nn.softmax)

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    model = tx.Model(run_inputs=x,
                     run_outputs=[out1, out2],
                     train_inputs=[x, labels],
                     train_outputs=[out2, out1],
                     train_loss=loss(out1, labels))

    lr = tx.Param(0.5)
    opt = model.set_optimizer(tf.optimizers.SGD,
                              learning_rate=lr,
                              clipnorm=0.1)

    data1 = [[1., 1.], [1., 1.]]
    data2 = [[0., 1.], [0., 1.]]

    w1 = y.weights.numpy()

    epochs = 100
    model.train(train_data=[{x: data1, labels: data2}], epochs=epochs)

    w2 = y.weights.value()

    y.weights.assign(w1)

    for _ in range(epochs):
        model.train_step(input_feed={x: data1, labels: data2})

    w3 = y.weights.value()

    assert tx.tensor_equal(w2, w3)
Example 11
def test_gate():
    inputs = tx.Input(init_value=tf.ones([2, 3]))
    linear = tx.Linear(inputs, n_units=4)
    nop = tx.Activation(linear, fn=tx.identity)
    gate_w = tx.Linear(linear, n_units=4, add_bias=True)
    gate1 = tx.Gate(linear, gate_w)
    gate2 = gate1.reuse_with(nop)

    assert tx.shape_equal(gate1.shape, gate2.shape)

    r1 = gate1()
    r2 = gate2()

    assert tx.tensor_equal(r1, r2)
Example 12
def test_fully_connected():
    x1 = tx.Input(init_value=[[1., 1., 1., 1.]],
                  n_units=4,
                  dtype=tf.float32,
                  constant=True)
    x2 = tx.Input(init_value=np.random.uniform(size=[2, 4]),
                  dtype=tf.float32,
                  n_units=4,
                  constant=True)

    y1 = tx.FC(x1, 4, add_bias=True, activation=tf.sigmoid)

    y2 = tx.Linear(x1,
                   4,
                   add_bias=True,
                   weights=y1.linear.weights,
                   bias=y1.linear.bias)
    a2 = tx.Activation(y2, fn=tf.sigmoid)

    w = y2.weights
    b = y2.bias

    assert y1.linear.weights is w
    assert y1.linear.bias is b

    x = x1()
    y = tf.matmul(x, w) + b
    a = tf.sigmoid(y)

    assert tx.tensor_equal(y2(), y)
    assert tx.tensor_equal(y1(), a)
    assert tx.tensor_equal(y1(), a2())
    assert tx.tensor_equal(a2(), a)

    y1 = y1.reuse_with(x2)
    y2 = y2.reuse_with(x2)

    assert y2.weights is w
    assert y2.bias is b

    assert y1.linear.weights is w
    assert y1.linear.bias is b
Example 13
def test_activation():
    inputs = tx.Input(init_value=tf.ones([2, 2]), n_units=2)
    output = tx.Activation(inputs, tf.sigmoid)
    assert tx.shape_equal(inputs.shape, output.shape)
Example 14
data = np.concatenate([v, labels], -1)

data = repeat_it(data, 2)

data = shuffle_it(iter(data), buffer_size=batch_size * 4)
data = batch_it(data, batch_size)

label_layer = tx.Input(1)
in_layer = tx.Input(M)

f1 = tx.FC(in_layer, 512, activation=tf.nn.tanh)
f2 = tx.FC(f1, 512, activation=tf.nn.relu)
fm = tx.Highway(f1, f2, carry_gate=True)

out = tx.Linear(f2, 1)
out_prob = tx.Activation(out, fn=tx.sigmoid)

loss = tx.binary_cross_entropy(labels=label_layer.tensor, logits=out.tensor)

model = tx.Model(run_inputs=in_layer,
                 run_outputs=out_prob,
                 train_in_loss=label_layer,
                 train_out_loss=loss)

runner = tx.ModelRunner(model)
runner.config_optimizer(optimizer=tf.train.AdamOptimizer(learning_rate=0.001))
runner.init_vars()

for data_batch in data:
    data_batch = np.array(data_batch)
    ctx_vector = data_batch[:, :-1]
Example 15
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 k_dim,
                 ri_tensor: RandomIndexTensor,
                 embed_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 logit_bias=False,
                 use_gate=True,
                 use_hidden=False,
                 h_dim=100,
                 h_activation=tx.elu,
                 h_init=tx.he_normal_init(),
                 h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 use_dropout=True,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5):

        # GRAPH INPUTS
        run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
        loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
        eval_inputs = loss_inputs

        # RUN GRAPH =====================================================
        var_reg = []
        with tf.name_scope("run"):
            # RI ENCODING ===============================================
            # convert ids to RIs: gather a set of random indexes based on the ids in a sequence

            # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim)
            # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
            with tf.name_scope("ri_encode"):
                # used to compute logits
                if isinstance(ri_tensor, RandomIndexTensor):
                    ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(),
                                              k_dim)

                    ri_inputs = ri_tensor.gather(run_inputs.tensor)
                    ri_inputs = ri_inputs.to_sparse_tensor()
                    ri_inputs = tx.TensorLayer(ri_inputs, k_dim)
                else:
                    ri_layer = tx.TensorLayer(ri_tensor, k_dim)
                    ri_inputs = tx.gather_sparse(ri_layer.tensor,
                                                 run_inputs.tensor)
                    ri_inputs = tx.TensorLayer(ri_inputs, k_dim)

            # use those sparse indexes to lookup a set of features based on the ri values
            feature_lookup = tx.Lookup(ri_inputs,
                                       ctx_size, [k_dim, embed_dim],
                                       embed_init,
                                       name="lookup")
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()
            # ===========================================================

            if use_gate or use_hidden:
                hl = tx.Linear(feature_lookup,
                               h_dim,
                               h_init,
                               bias=True,
                               name="h_linear")
                ha = tx.Activation(hl, h_activation, name="h_activation")
                h = tx.Compose(hl, ha, name="hidden")
                var_reg.append(hl.weights)

            features = feature_lookup
            if use_gate:
                features = tx.Gate(features, ctx_size, gate_input=h)
                gate = features
                var_reg.append(features.gate_weights)

            x_to_f = tx.Linear(features,
                               embed_dim,
                               x_to_f_init,
                               bias=True,
                               name="x_to_f")
            var_reg.append(x_to_f.weights)
            f_prediction = x_to_f

            if use_hidden:
                h_to_f = tx.Linear(h,
                                   embed_dim,
                                   h_to_f_init,
                                   bias=True,
                                   name="h_to_f")
                var_reg.append(h_to_f.weights)
                f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

            # RI DECODING ===============================================
            shared_weights = feature_lookup.weights if embed_share else None
            logit_init = logit_init if not embed_share else None
            # embedding feature vectors for all words: shape [vocab_size, embed_dim]
            # later, for NCE we don't need to get all the features

            all_embeddings = tx.Linear(ri_layer,
                                       embed_dim,
                                       logit_init,
                                       shared_weights,
                                       name="logits",
                                       bias=False)

            # dot product of f_predicted . all_embeddings with bias for each target word

            run_logits = tx.Linear(f_prediction,
                                   n_units=vocab_size,
                                   shared_weights=all_embeddings.tensor,
                                   transpose_weights=True,
                                   bias=logit_bias)

            if not embed_share:
                var_reg.append(all_embeddings.weights)

            # ===========================================================
            run_embed_prob = tx.Activation(run_logits, tx.softmax)

        # TRAIN GRAPH ===================================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(ri_inputs)
                features = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                features = feature_lookup

            if use_gate or use_hidden:
                if use_dropout:
                    h = h.reuse_with(features)
                    h = tx.Dropout(h, probability=keep_prob)

                if use_gate:
                    features = gate.reuse_with(features, gate_input=h)

                f_prediction = x_to_f.reuse_with(features)

                if use_hidden:
                    h_to_f = h_to_f.reuse_with(h)
                    if use_dropout:
                        h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                    f_prediction = tx.Add(f_prediction, h_to_f)
            else:
                f_prediction = f_prediction.reuse_with(features)

            # we already defined all_embeddings, from which these logits are computed, so this should be ok
            train_logits = run_logits.reuse_with(f_prediction)

            train_embed_prob = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="train_output")

            one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                       num_cols=vocab_size)
            train_loss = tx.categorical_cross_entropy(one_hot,
                                                      train_logits.tensor)

            train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # EVAL GRAPH ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # SETUP MODEL CONTAINER ====================================
        super().__init__(run_inputs=run_inputs,
                         run_outputs=run_embed_prob,
                         train_inputs=run_inputs,
                         train_outputs=train_embed_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=run_embed_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
Example 16
    def __init__(
        self,
        inputs,
        label_inputs,
        vocab_size,
        embed_dim,
        h_dim,
        embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        num_h=1,
        h_activation=tx.elu,
        h_init=tx.he_normal_init(),
        use_dropout=False,
        embed_dropout=False,
        drop_probability=0.05,
        l2_loss=False,
        l2_weight=1e-5,
        use_f_predict=False,
        f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        embed_share=False,
        logit_bias=False,
        use_nce=False,
        nce_samples=10,
    ):
        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        self.inputs = inputs
        self.labels = label_inputs
        if not isinstance(label_inputs, tx.Input):
            raise TypeError("labels must be an Input layer")

        if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
            raise TypeError(
                "Invalid dtype for input: expected int32 or int64, got {}".
                format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num hidden should be >= 0")

        ctx_size = inputs.n_units
        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup
            embeddings = tx.Lookup(inputs,
                                   ctx_size, [vocab_size, embed_dim],
                                   weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.as_concat()

            last_layer = feature_lookup
            h_layers = []
            for i in range(num_h):
                h_i = tx.FC(layer=last_layer,
                            n_units=h_dim,
                            activation=h_activation,
                            weight_init=h_init,
                            add_bias=True,
                            name="h_{}".format(i + 1))
                h_layers.append(h_i)
                last_layer = h_i
                var_reg.append(h_i.linear.weights)

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer,
                                       embed_dim,
                                       f_init,
                                       add_bias=True,
                                       name="f_predict")
                var_reg.append(last_layer.weights)
                f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits,
                                       tx.softmax,
                                       name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                last_layer = tx.Dropout(feature_lookup,
                                        probability=drop_probability,
                                        name="dropout_features")
            else:
                last_layer = feature_lookup

            # add dropout between each layer
            for i, layer in enumerate(h_layers):
                h = layer.reuse_with(last_layer)
                if use_dropout:
                    h = tx.Dropout(h,
                                   probability=drop_probability,
                                   name="dropout_{}".format(i + 1))
                last_layer = h

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = f_predict.reuse_with(last_layer)

            train_logits = run_logits.reuse_with(last_layer,
                                                 name="train_logits")
            train_output = tx.Activation(train_logits,
                                         tx.softmax,
                                         name="train_output")

            def categorical_loss(labels, logits):
                labels = tx.dense_one_hot(column_indices=labels,
                                          num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels,
                                                    logits=logits)
                return tf.reduce_mean(loss)

            def nce_loss(labels, weights, bias, predict):
                noise = uniform_sampler(labels, 1, nce_samples, True,
                                        vocab_size)
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size],
                                        name="nce_bias")

                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(label_inputs,
                                            nce_weights,
                                            bias,
                                            last_layer,
                                            apply_fn=nce_loss,
                                            name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(label_inputs,
                                            train_logits,
                                            apply_fn=categorical_loss,
                                            name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.WrapLayer(
                    train_loss,
                    wrap_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                    name="train_loss_l2")

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(label_inputs,
                                       run_logits,
                                       apply_fn=categorical_loss,
                                       name="eval_loss")

        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, label_inputs],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, label_inputs],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
Example 17
all_embeddings = tx.Linear(ri_layer,
                           embed_size,
                           shared_weights=lookup.weights,
                           name="all_features",
                           bias=False)

# dot product of f_predicted . all_embeddings with bias for each target word
run_logits = tx.Linear(feature_predict,
                       vocab_size,
                       shared_weights=all_embeddings.tensor,
                       transpose_weights=True,
                       bias=False,
                       name="logits")

embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output")

one_hot = tx.dense_one_hot(column_indices=input_labels.tensor,
                           num_cols=vocab_size)
val_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
val_loss = tf.reduce_mean(val_loss)

# *************************************
#   Testing adaptive noise
# *************************************
noise_logits = tx.Linear(lookup, k, bias=True)
adaptive_noise = tx.sample_sigmoid_from_logits(noise_logits.tensor, n=1)
adaptive_noise = tx.TensorLayer(adaptive_noise, n_units=k)
# adaptive_noise = tx.to_sparse(adaptive_noise)

# *************************************
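
The "dot product of f_predicted . all_embeddings" comment above describes weight tying: logits come from multiplying the predicted feature vector by the transposed embedding matrix. A minimal plain-TensorFlow sketch of that computation; the names are illustrative and not part of the library API.

import tensorflow as tf

def tied_logits(f_predicted, embeddings, bias=None):
    # f_predicted: [batch, embed_dim], embeddings: [vocab_size, embed_dim]
    logits = tf.matmul(f_predicted, embeddings, transpose_b=True)  # [batch, vocab_size]
    if bias is not None:
        logits = logits + bias
    return logits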
Example 18
    def __init__(self,
                 inputs,
                 labels,
                 vocab_size,
                 embed_dim,
                 h_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 num_h=1,
                 h_activation=tx.tanh,
                 h_init=tx.he_normal_init(),
                 reset_state=True,
                 embed_dropout=False,
                 w_dropout=False,
                 u_dropconnect=False,
                 other_dropout=False,
                 w_keep_prob=0.9,
                 u_keep_prob=0.9,
                 embed_keep_prob=0.9,
                 other_keep_prob=0.9,
                 l2_loss=False,
                 l2_weight=1e-5,
                 use_f_predict=False,
                 f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=False,
                 logit_bias=False,
                 use_nce=False,
                 nce_samples=10,
                 ):
        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        self.inputs = inputs
        self.labels = labels
        if not isinstance(labels, tx.Input):
            raise TypeError("labels must be an Input layer")

        if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
            raise TypeError("Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num hidden should be >= 0")

        ctx_size = inputs.n_units
        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup
            embeddings = tx.Lookup(inputs, ctx_size, [vocab_size, embed_dim], weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.permute_batch_time()

            last_layer = feature_lookup
            last_feature_layer = feature_lookup


            for i in range(num_h):
                h_i = tx.QRNN(feature_lookup,
                              n_units=h_dim,
                              activation=h_activation)


                last_layer = h_i
                # save the last state; it will be used as the state of the first cell

                var_reg += [wi.weights for wi in last_layer.w]
                var_reg += [ui.weights for ui in last_layer.u]

            if not reset_state:
                last_layer = zero_state.reuse_with(last_layer, name="cache_last_state")

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict")
                var_reg.append(last_layer.weights)
                f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits, tx.softmax, name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            embeddings = embeddings.reuse_with(inputs)
            feature_lookup = embeddings.as_seq()

            if other_dropout and embed_dropout:
                feature_lookup = tx.Dropout(feature_lookup, probability=embed_keep_prob, name="drop_features")

            # last_layer = last_layer.as_seq()

            # add dropout between each layer
            # for i, layer in enumerate(h_layers):
            cell = lstm_cells[0]

            for i in range(ctx_size):
                if i == 0:
                    h = cell.reuse_with(input_layer=feature_lookup[i],
                                        previous_state=None,  # copy from first cell
                                        previous_memory=None,  # copy from first cell
                                        regularized=w_dropout or u_dropconnect,
                                        name="lstm_cell_{}".format(i))

                else:
                    h = cell.reuse_with(input_layer=feature_lookup[i],
                                        previous_state=last_layer,
                                        name="lstm_cell_{}".format(i))

                cell = h
                # if use_dropout:
                #    h = tx.ZoneOut(h,
                #                   previous_layer=h.previous_state,
                #                   keep_prob=keep_prob,
                #                   name="zoneout_{}".format(i))
                last_layer = h
            if not reset_state:
                last_layer = zero_state.reuse_with(last_layer, name="cache_last_cell")

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = f_predict.reuse_with(last_layer)

            train_logits = run_logits.reuse_with(last_layer, name="train_logits")
            train_output = tx.Activation(train_logits, tx.softmax, name="train_output")

            def categorical_loss(labels, logits):
                labels = tx.dense_one_hot(column_indices=labels, num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels, logits=logits)
                # loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits)
                return tf.reduce_mean(loss)

            def nce_loss(labels, weights, bias, predict):
                noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias")

                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(labels, train_logits, apply_fn=categorical_loss, name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.LambdaLayer(train_loss,
                                            apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                                            name="train_loss_l2")

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(labels, run_logits, apply_fn=categorical_loss, name="eval_loss")

        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, labels],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, labels],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
Example 19
from tqdm import tqdm

n_hidden = 20
embed_dim = 10
seq_size = 2
vocab_size = 100
feature_shape = [vocab_size, embed_dim]

loss_inputs = tx.Input(1, dtype=tf.int32)
in_layer = tx.Input(seq_size, dtype=tf.int32)

lookup = tx.Lookup(in_layer, seq_size=seq_size, lookup_shape=feature_shape)
# [batch x seq_size * feature_shape[1]]

h = tx.Linear(lookup, n_hidden, bias=True)
ha = tx.Activation(h, tx.elu)
h = tx.Compose(h, ha)

logits = tx.Linear(h, vocab_size, bias=True)
out = tx.Activation(logits, tx.softmax)

labels = tx.dense_one_hot(loss_inputs.tensor, vocab_size)
loss = tf.reduce_mean(
    tx.categorical_cross_entropy(labels=labels, logits=logits.tensor))

# setup optimizer
optimizer = tx.AMSGrad(learning_rate=0.01)

model = tx.Model(run_inputs=in_layer,
                 run_outputs=out,
                 train_inputs=in_layer,
Example 20
# reshape to [batch x seq_size x feature_shape[1]]
lookup_to_seq = tf.reshape(lookup.tensor, [-1, seq_size, embed_dim])

# type of rnn cell
cell = tf.nn.rnn_cell.LSTMCell(num_units=n_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, lookup_to_seq, dtype=tf.float32)

val = tf.transpose(val, [1, 0, 2])

# last = tf.gather(val, int(val.get_shape()[0]) - 1)
last = val[-1]

lstm_out = tx.TensorLayer(last, n_hidden)
logits = tx.Linear(lstm_out, vocab_size, bias=True)
out = tx.Activation(logits, tx.softmax)

labels = tx.dense_one_hot(loss_inputs.tensor, vocab_size)
loss = tf.reduce_mean(tx.categorical_cross_entropy(labels=labels, logits=logits.tensor))

# setup optimizer
optimizer = tx.AMSGrad(learning_rate=0.01)

model = tx.Model(run_inputs=in_layer, run_outputs=out,
                 train_inputs=in_layer, train_outputs=out,
                 train_in_loss=loss_inputs, train_out_loss=loss,
                 eval_out_score=loss, eval_in_score=loss_inputs)

print(model.feedable_train())

runner = tx.ModelRunner(model)
Example 21
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 embed_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 use_gate=True,
                 use_hidden=False,
                 h_dim=100,
                 h_activation=tx.elu,
                 h_init=tx.he_normal_init(),
                 h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 use_dropout=True,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5,
                 use_nce=False,
                 nce_samples=100):

        # GRAPH INPUTS
        run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
        loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
        eval_inputs = loss_inputs

        # RUN GRAPH
        # if I created a scope here, the TensorBoard graph would be a mess to read
        # because it groups everything by nested scope names;
        # instead, creating different scopes only for train and eval
        # keeps the graph readable because it lets us use the same names
        # under different scopes while still sharing variables
        var_reg = []
        with tf.name_scope("run"):
            feature_lookup = tx.Lookup(run_inputs,
                                       ctx_size, [vocab_size, embed_dim],
                                       embed_init,
                                       name="lookup")
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()

            if use_gate or use_hidden:
                hl = tx.Linear(feature_lookup,
                               h_dim,
                               h_init,
                               bias=True,
                               name="h_linear")
                ha = tx.Activation(hl, h_activation, name="h_activation")
                h = tx.Compose(hl, ha, name="hidden")
                var_reg.append(hl.weights)

            features = feature_lookup
            if use_gate:
                gate_w = tx.Linear(h, ctx_size, bias=True)
                gate = tx.Gate(features, gate_input=gate_w)

                # gate = tx.Module([h, features], gate)

                features = gate
                var_reg.append(gate_w.weights)

            x_to_f = tx.Linear(features,
                               embed_dim,
                               x_to_f_init,
                               bias=True,
                               name="x_to_f")
            var_reg.append(x_to_f.weights)
            f_prediction = x_to_f

            if use_hidden:
                h_to_f = tx.Linear(h,
                                   embed_dim,
                                   h_to_f_init,
                                   bias=True,
                                   name="h_to_f")
                var_reg.append(h_to_f.weights)
                f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

            # RI DECODING ===============================================
            shared_weights = tf.transpose(
                feature_lookup.weights) if embed_share else None
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(f_prediction,
                                   vocab_size,
                                   logit_init,
                                   shared_weights,
                                   bias=True,
                                   name="logits")
            if not embed_share:
                var_reg.append(run_logits.weights)
            y_prob = tx.Activation(run_logits, tx.softmax)

        # TRAIN GRAPH ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(run_inputs)
                features = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                features = feature_lookup

            if use_gate or use_hidden:
                if use_dropout:
                    h = h.reuse_with(features)
                    h = tx.Dropout(h, probability=keep_prob)

                if use_gate:
                    gate_w = gate_w.reuse_with(h)
                    features = gate.reuse_with(layer=features,
                                               gate_input=gate_w)

                f_prediction = x_to_f.reuse_with(features)

                if use_hidden:
                    h_to_f = h_to_f.reuse_with(h)
                    if use_dropout:
                        h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                    f_prediction = tx.Add(f_prediction, h_to_f)
            else:
                f_prediction = f_prediction.reuse_with(features)

            train_logits = run_logits.reuse_with(f_prediction)

            if use_nce:
                # uniform gets good enough results if enough samples are used
                # but we can load the empirical unigram distribution
                # or learn the unigram distribution during training
                sampled_values = uniform_sampler(loss_inputs.tensor, 1,
                                                 nce_samples, True, vocab_size)
                train_loss = tf.nn.nce_loss(weights=tf.transpose(
                    train_logits.weights),
                                            biases=train_logits.bias,
                                            inputs=f_prediction.tensor,
                                            labels=loss_inputs.tensor,
                                            num_sampled=nce_samples,
                                            num_classes=vocab_size,
                                            num_true=1,
                                            sampled_values=sampled_values)
            else:
                one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                           num_cols=vocab_size)
                train_loss = tx.categorical_cross_entropy(
                    one_hot, train_logits.tensor)

            train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # EVAL GRAPH ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # SETUP MODEL CONTAINER ====================================
        super().__init__(run_inputs=run_inputs,
                         run_outputs=y_prob,
                         train_inputs=run_inputs,
                         train_outputs=y_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=y_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
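
The comment in the NCE branch above notes that a uniform sampler works well when enough samples are drawn, but that an empirical unigram distribution can be used instead. A minimal sketch with TensorFlow's built-in candidate samplers; labels is assumed to be a [batch, 1] int64 tensor of target ids and unigram_counts an optional list of per-word frequencies (both names are illustrative).

import tensorflow as tf

def make_sampled_values(labels, num_sampled, vocab_size, unigram_counts=None):
    if unigram_counts is None:
        # uniform noise distribution over the vocabulary
        return tf.random.uniform_candidate_sampler(
            true_classes=labels, num_true=1, num_sampled=num_sampled,
            unique=True, range_max=vocab_size)
    # empirical unigram noise distribution built from corpus counts
    return tf.random.fixed_unigram_candidate_sampler(
        true_classes=labels, num_true=1, num_sampled=num_sampled,
        unique=True, range_max=vocab_size, unigrams=unigram_counts)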
Example 22
all_embeddings = tx.Linear(ri_layer,
                           embed_size,
                           shared_weights=lookup.weights,
                           name="all_features",
                           bias=False)

# dot product of f_predicted . all_embeddings with bias for each target word
run_logits = tx.Linear(feature_predict,
                       vocab_size,
                       shared_weights=all_embeddings.tensor,
                       transpose_weights=True,
                       bias=False,
                       name="logits")

embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output")

one_hot = tx.dense_one_hot(column_indices=input_labels.tensor,
                           num_cols=vocab_size)
val_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
val_loss = tf.reduce_mean(val_loss)

# *************************************
#   Testing adaptive noise
# *************************************
# TODO test the infinite vocab scenario where we try to generate
# RIs directly; we could use sparsemax in that case
noise_logits = tx.Linear(lookup, vocab_size, bias=True)
adaptive_noise = tx.Activation(noise_logits, tx.softmax)

# adaptive_noise = tx.sample_sigmoid_from_logits(noise_logits.tensor, n=1)
Example 23
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 k_dim,
                 s_active,
                 ri_tensor,
                 embed_dim,
                 h_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 num_h=1,
                 h_activation=tx.relu,
                 h_init=tx.he_normal_init,
                 use_dropout=False,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5,
                 f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 logit_bias=False,
                 use_nce=False,
                 nce_samples=100,
                 noise_level=0.1):

        run_inputs = tx.Input(ctx_size, dtype=tf.int32)
        loss_inputs = tx.Input(n_units=1, dtype=tf.int64)
        eval_inputs = loss_inputs

        if run_inputs.dtype != tf.int32 and run_inputs.dtype != tf.int64:
            raise TypeError(
                "Invalid dtype for input: expected int32 or int64, got {}".
                format(run_inputs.dtype))

        if num_h < 0:
            raise ValueError("num hidden should be >= 0")

        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # RI ENCODING ===============================================
            # convert ids to RIs: gather a set of random indexes based on the ids in a sequence
            # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim)
            # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
            # ri_inputs = tx.TensorLayer(ri_inputs, n_units=k_dim)
            with tf.name_scope("ri_encode"):
                if isinstance(ri_tensor, RandomIndexTensor):
                    ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(),
                                              k_dim,
                                              shape=[vocab_size, k_dim])

                    ri_inputs = ri_tensor.gather(run_inputs.tensor)
                    ri_inputs = ri_inputs.to_sparse_tensor()
                    ri_inputs = tx.TensorLayer(
                        ri_inputs,
                        k_dim,
                        shape=[ri_inputs.get_shape()[0], k_dim])
                # ri_tensor is a sparse tensor
                else:
                    raise TypeError(
                        "please supply RandomIndexTensor instead of sparse Tensor"
                    )
                    # ri_layer = tx.TensorLayer(ri_tensor, k_dim)
                    # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
                    # ri_inputs = tx.TensorLayer(ri_inputs, k_dim)

            feature_lookup = tx.Lookup(ri_inputs,
                                       ctx_size, [k_dim, embed_dim],
                                       embed_init,
                                       name="lookup")
            self.embeddings = feature_lookup
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()
            # ===========================================================

            last_layer = feature_lookup
            h_layers = []
            for i in range(num_h):
                h_i = tx.Linear(last_layer,
                                h_dim,
                                h_init,
                                bias=True,
                                name="h_{i}_linear".format(i=i))
                h_a = tx.Activation(h_i, h_activation)
                h = tx.Compose(h_i, h_a, name="h_{i}".format(i=i))
                h_layers.append(h)
                last_layer = h
                var_reg.append(h_i.weights)

            self.h_layers = h_layers

            # feature prediction for Energy-Based Model

            f_prediction = tx.Linear(last_layer,
                                     embed_dim,
                                     f_init,
                                     bias=True,
                                     name="f_predict")
            var_reg.append(f_prediction.weights)
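            # the network predicts a point in embedding space; the logits further down
            # are dot products between this prediction and each word's embedding, which
            # appears to be the energy-based formulation the comment above refers to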

            # RI DECODING ===============================================

            # Shared Embeddings
            if embed_share:
                shared_weights = feature_lookup.weights
                logit_init = None

                # ri_dense = tx.ToDense(ri_layer)
                all_embeddings = tx.Linear(ri_layer,
                                           embed_dim,
                                           logit_init,
                                           shared_weights,
                                           name="all_features",
                                           bias=False)

                # dot product of f_predicted . all_embeddings with bias for each target word
                run_logits = tx.Linear(f_prediction,
                                       vocab_size,
                                       shared_weights=all_embeddings.tensor,
                                       transpose_weights=True,
                                       bias=logit_bias,
                                       name="logits")
            else:
                run_logits = tx.Linear(f_prediction,
                                       vocab_size,
                                       bias=logit_bias,
                                       name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)
            # ===========================================================

            embed_prob = tx.Activation(run_logits,
                                       tx.softmax,
                                       name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(ri_inputs)
                last_layer = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                last_layer = feature_lookup

            # add dropout between each layer
            for layer in h_layers:
                h = layer.reuse_with(last_layer)
                if use_dropout:
                    h = tx.Dropout(h, probability=keep_prob)
                last_layer = h

            f_prediction = f_prediction.reuse_with(last_layer)

            train_logits = run_logits.reuse_with(f_prediction,
                                                 name="train_logits")
            train_embed_prob = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="train_output")

            if use_nce:
                # labels
                labels = loss_inputs.tensor

                #  convert labels to random indices
                def labels_to_ri(x):
                    random_index_tensor = ri_tensor.gather(x)
                    sp_features = random_index_tensor.to_sparse_tensor()
                    return sp_features

                model_prediction = f_prediction.tensor

                train_loss = tx.sparse_cnce_loss(
                    label_features=labels,
                    model_prediction=model_prediction,
                    weights=feature_lookup.weights,
                    noise_ratio=noise_level,
                    num_samples=nce_samples,
                    labels_to_sparse_features=labels_to_ri)
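                # sparse_cnce_loss presumably trains the model to distinguish the true
                # label features from nce_samples noise samples (noise_ratio controls
                # how much noise is injected), avoiding a full softmax over the vocabulary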

            else:
                one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                           num_cols=vocab_size)
                train_loss = tx.categorical_cross_entropy(
                    one_hot, train_logits.tensor)

                train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # BUILD MODEL
        super().__init__(run_inputs=run_inputs,
                         run_outputs=embed_prob,
                         train_inputs=run_inputs,
                         train_outputs=train_embed_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=embed_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
Example n. 24
0
    def __init__(self,
                 inputs,
                 labels,
                 vocab_size,
                 embed_dim,
                 h_dim,
                 embed_init=tx.zeros_init(),
                 logit_init=tx.glorot_uniform(),
                 num_h=1,
                 h_activation=tx.tanh,
                 h_init=tx.glorot_uniform(),
                 w_dropconnect=None,
                 u_dropconnect=None,
                 r_dropout=0.4,
                 y_dropout=0.4,
                 embed_dropout=0.3,
                 other_dropout=0.3,
                 l2_loss=False,
                 l2_weight=1e-5,
                 use_f_predict=False,
                 f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=False,
                 logit_bias=False,
                 use_nce=False,
                 nce_samples=10,
                 skip_connections=False):
        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        if not isinstance(labels, tx.Input):
            raise TypeError("labels must be an Input layer")
        self.inputs = inputs
        self.labels = labels

        if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
            raise TypeError(
                "Invalid dtype for input: expected int32 or int64, got {}".
                format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num hidden should be >= 0")

        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup

            embeddings = tx.Lookup(inputs,
                                   seq_size=None,
                                   lookup_shape=[vocab_size, embed_dim],
                                   weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.permute_batch_time()
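            # permute_batch_time presumably turns the batch-major lookup output
            # [batch, time, embed_dim] into the time-major [time, batch, embed_dim]
            # layout consumed by the recurrent layers below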

            last_layer = feature_lookup

            cell_proto = tx.LSTMCell.proto(
                n_units=h_dim,
                activation=h_activation,
                gate_activation=tx.hard_sigmoid,
                w_init=h_init,
                u_init=h_init,
                w_dropconnect=w_dropconnect,
                u_dropconnect=u_dropconnect,
                r_dropout=r_dropout,
                x_dropout=None,
                y_dropout=y_dropout,
                regularized=False,
                name="cell",
            )
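            # the proto appears to act as a cell factory: each RNN layer below builds
            # its own LSTM cell from it, so all layers share the same hyperparameters
            # while keeping separate weights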

            lstm_layers = []
            for i in range(num_h):
                lstm_layer = tx.RNN(last_layer,
                                    cell_proto=cell_proto,
                                    regularized=False,
                                    stateful=True,
                                    name="LSTM_{}".format(i + 1))

                lstm_layers.append(lstm_layer)

                var_reg += [wi.weights for wi in lstm_layer.cell.w]
                var_reg += [ui.weights for ui in lstm_layer.cell.u]

                last_layer = lstm_layer

            # last time step is the state used to make the prediction
            # last_layer = tx.Reshape(last_layer, [-1, h_dim])

            # TODO: this is not consistent with locked dropout for the last layer,
            # where the same mask should be applied across time steps.
            # Doing that needs either y_dropout to be available here, or some map
            # operation usable with layers that output 3D tensors, something
            # equivalent to https://keras.io/layers/wrappers/ (which applies a layer
            # to every temporal slice of an input; they implement it the same way
            # they implement an RNN).
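            # a minimal sketch of locked (variational) dropout, assuming a time-major
            # [time, batch, n_units] output; keep_prob and batch_size are hypothetical
            # names, the point is that a single mask is broadcast over the time axis:
            #   mask = tf.floor(keep_prob + tf.random.uniform([1, batch_size, h_dim]))
            #   locked = last_layer.tensor * mask / keep_prob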

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer,
                                       embed_dim,
                                       f_init,
                                       add_bias=True,
                                       name="f_predict")
                # proto = tx.GRUCell.proto(n_units=embed_dim,
                #                          activation=h_activation,
                #                          gate_activation=tx.hard_sigmoid,
                #                          w_init=h_init,
                #                          u_init=h_init,
                #                          w_dropconnect=w_dropconnect,
                #                          u_dropconnect=u_dropconnect,
                #                          r_dropout=r_dropout,
                #                          x_dropout=None,
                #                          y_dropout=y_dropout,
                #                          regularized=False)
                # last_layer1 = tx.RNN(last_layer, cell_proto=proto, regularized=False, stateful=False)
                # last_layer2 = last_layer1.reuse_with(last_layer, reverse=True)
                # last_layer = tx.Add(last_layer1, last_layer2)
                # last_layer = tx.Module(last_layer, last_layer)
                var_reg += last_layer.variables
                # var_reg.append(last_layer.weights)
                f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits,
                                       tx.softmax,
                                       name="run_output")

            # ===============================================
            # TRAIN GRAPH
            # ===============================================
            with tf.name_scope("train"):
                embeddings = embeddings.reuse_with(inputs)
                feature_lookup = embeddings.permute_batch_time()

                if embed_dropout:
                    feature_lookup = tx.Dropout(feature_lookup,
                                                probability=embed_dropout,
                                                name="drop_features")

                last_layer = feature_lookup

                for i in range(num_h):
                    lstm_layer = lstm_layers[i].reuse_with(last_layer,
                                                           regularized=True)
                    last_layer = lstm_layer

                # last_layer = tx.Reshape(last_layer, [-1, h_dim])

                # feature prediction for Energy-Based Model
                if use_f_predict:
                    # last_layer = f_predict.reuse_with(last_layer)
                    last_layer = f_predict.reuse_with(last_layer,
                                                      regularized=True)

                last_layer = tx.Dropout(last_layer,
                                        probability=other_dropout,
                                        locked=False)

                train_logits = run_logits.reuse_with(last_layer,
                                                     name="train_logits")

                train_output = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="run_output")

            def categorical_loss(labels, logits):
                # labels arrive batch-major, e.g. [[1, 2], [3, 4]]; transpose + reshape
                # gives [1, 3, 2, 4] so the time steps line up with the order of the logits
                labels = tx.Transpose(labels)
                labels = tx.Reshape(labels, [-1])
                labels = tx.dense_one_hot(labels, num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels,
                                                    logits=logits)

                return tf.reduce_mean(loss)

            def nce_loss(labels, weights, bias, predict):
                noise = uniform_sampler(labels, 1, nce_samples, True,
                                        vocab_size)
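                # `uniform_sampler` is not defined in this snippet; it is assumed to be
                # imported elsewhere, e.g. a thin alias for tf.random.uniform_candidate_sampler,
                # whose result can be passed to tf.nn.nce_loss as sampled_values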
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size],
                                        name="nce_bias")

                # wrap the embedding layer so that its weight matrix is exposed as the
                # layer's output (used as the NCE weight matrix below)
                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(labels,
                                            nce_weights,
                                            bias,
                                            last_layer,
                                            apply_fn=nce_loss,
                                            name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(labels,
                                            train_logits,
                                            apply_fn=categorical_loss,
                                            name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.LambdaLayer(
                    train_loss,
                    apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                    name="train_loss_l2")

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(labels,
                                       run_logits,
                                       apply_fn=categorical_loss,
                                       name="eval_loss")

        self.stateful_layers = lstm_layers
        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, labels],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, labels],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
Example n. 25
0
    def __init__(
        self,
        run_inputs,
        label_inputs,
        eval_label_input,
        ctx_size,
        k_dim,
        ri_tensor_input,
        embed_dim,
        h_dim,
        embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        num_h=1,
        h_activation=tx.relu,
        h_init=tx.he_normal_init,
        use_dropout=False,
        embed_dropout=False,
        keep_prob=0.95,
        l2_loss=False,
        l2_loss_coef=1e-5,
        f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
        use_nce=False,
        nce_samples=2,
        nce_noise_amount=0.1,
        noise_input=None,
    ):

        self.embed_dim = embed_dim

        var_reg = []

        # ===============================================
        # RUN GRAPH
        # ===============================================

        with tf.name_scope("run"):

            feature_lookup = tx.Lookup(run_inputs,
                                       seq_size=ctx_size,
                                       lookup_shape=[k_dim, embed_dim],
                                       weight_init=embed_init,
                                       name="lookup")

            self.embeddings = feature_lookup
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()
            # ===========================================================
            with tf.name_scope("cache_embeddings"):
                # ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in range(len(sign_index))]
                # self.all_ris = ris_to_sp_tensor_value(ri_seq=ris,
                #                                      dim=sign_index.generator.dim,
                #                                      all_positive=not sign_index.generator.symmetric)

                all_embeddings = tx.Linear(
                    ri_tensor_input,
                    n_units=self.embed_dim,
                    shared_weights=self.embeddings.weights,
                    bias=False,
                    name='all_features')

                # caches all embedding computation for run/eval
                self.all_embeddings = tx.VariableLayer(all_embeddings,
                                                       trainable=False)
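                # the VariableLayer presumably snapshots the [vocab_size, embed_dim]
                # projection into a non-trainable variable so run/eval can reuse it
                # instead of recomputing every embedding on each step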
            # ===========================================================
            last_layer = feature_lookup
            h_layers = []
            for i in range(num_h):
                hi = tx.FC(last_layer,
                           n_units=h_dim,
                           activation=h_activation,
                           weight_init=h_init,
                           name="h_{i}".format(i=i))
                h_layers.append(hi)
                last_layer = hi
                var_reg.append(hi.linear.weights)

            self.h_layers = h_layers

            # feature prediction for Energy-Based Model

            f_prediction = tx.Linear(last_layer,
                                     embed_dim,
                                     f_init,
                                     bias=True,
                                     name="f_predict")
            var_reg.append(f_prediction.weights)

            # RI DECODING ===============================================
            # shape is (?,?) because batch size is unknown and vocab size is unknown
            # when we build the graph
            run_logits = tx.Linear(f_prediction,
                                   n_units=None,
                                   shared_weights=self.all_embeddings.variable,
                                   transpose_weights=True,
                                   bias=False,
                                   name="logits")

            # ===========================================================
            embed_prob = tx.Activation(run_logits,
                                       tx.softmax,
                                       name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(run_inputs)
                last_layer = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                last_layer = feature_lookup

            # add dropout between each layer
            for layer in h_layers:
                h = layer.reuse_with(last_layer)
                if use_dropout:
                    h = tx.Dropout(h, probability=keep_prob)
                last_layer = h

            f_prediction = f_prediction.reuse_with(last_layer)

            train_logits = run_logits.reuse_with(f_prediction,
                                                 name="train_logits")
            train_embed_prob = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="train_output")

            # model prediction in feature (embedding) space
            model_prediction = f_prediction.tensor

            if use_nce:
                train_loss = tx.sparse_cnce_loss(
                    label_features=label_inputs.tensor,
                    noise_features=noise_input.tensor,
                    model_prediction=model_prediction,
                    weights=feature_lookup.weights,
                    num_samples=nce_samples,
                    noise_ratio=nce_noise_amount)
            else:
                one_hot_dense = tx.dense_one_hot(
                    column_indices=label_inputs[0].tensor,
                    num_cols=label_inputs[1].tensor)
                train_loss = tx.categorical_cross_entropy(
                    one_hot_dense, train_logits.tensor)

                train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            one_hot_dense = tx.dense_one_hot(
                column_indices=eval_label_input[0].tensor,
                num_cols=label_inputs[1].tensor)
            eval_loss = tx.categorical_cross_entropy(one_hot_dense,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        if use_nce:
            train_loss_in = [label_inputs, noise_input]
        else:
            train_loss_in = label_inputs

        # BUILD MODEL
        super().__init__(run_inputs=run_inputs,
                         run_outputs=embed_prob,
                         train_inputs=run_inputs,
                         train_outputs=train_embed_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=embed_prob,
                         train_out_loss=train_loss,
                         train_in_loss=train_loss_in,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_label_input,
                         update_inputs=ri_tensor_input)
Example n. 26
0
# assuming the usual aliases for this example script (tx = TensorX)
import numpy as np
import tensorflow as tf
import tensorx as tx

v_dim = 1000
m_dim = 2
n_hidden = 100
seq_size = 2

w = [[0, 1], [1, 5], [0, 1]]
v2 = tf.constant(np.random.uniform(-1., 1., [v_dim, m_dim]))

inputs = tx.Input(2, dtype=tf.int32)

lookup = tx.Lookup(inputs, 2, lookup_shape=[v_dim, m_dim])

# GATING MECHANISM
# a "sequence gate": a small MLP over the concatenated lookup outputs one weight
# per sequence position (seq_size values in [0, 1]) used to scale that position
h = tx.Linear(lookup, n_hidden, bias=True)
h = tx.Activation(h, tx.elu)

gate = tx.Linear(h, seq_size, bias=True)
gate = tx.Activation(gate, tx.sigmoid)

# lookup might output a sequence format with [batch,seq_size,m_dim]
# lookup_out = lookup.tensor
lookup_out = tf.reshape(lookup.tensor, [-1, seq_size, m_dim])

# lookup_out is already [batch, seq_size, m_dim]; scale each position by its gate value
gated_out = lookup_out * tf.expand_dims(gate.tensor, -1)
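# gate.tensor has shape [batch, seq_size]; expand_dims makes it [batch, seq_size, 1],
# so each position's scalar gate is broadcast across that position's m_dim features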

# gated_out = tf.reshape(gated_out, [-1, seq_size * m_dim])
# gated_out = tf.reshape(gated_out, [-1, lookup.n_units])
gated_out = tf.reshape(gated_out, tf.shape(lookup.tensor))
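
# a minimal sketch (commented out) of how the gated features could be fed back into
# a tx graph, assuming tx.TensorLayer wraps an arbitrary tensor as in the examples above:
# gated_layer = tx.TensorLayer(gated_out, lookup.n_units)
# logits = tx.Linear(gated_layer, v_dim, bias=True)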