Example #1
import numpy as np
import tensorflow as tf
import tensorx as tx


def test_gather_sparse():
    v = tf.constant([[1, 0, 1], [0, 0, 2], [3, 0, 3]], tf.float32)
    sp = tx.to_sparse(v)

    indices = np.array([[0, 1], [2, 0]], dtype=np.int64)

    gather_sp = tx.gather_sparse(sp, indices)
    gather = tf.sparse.to_dense(gather_sp)
    expected = tf.constant([[1., 0., 1.], [0., 0., 2.], [3., 0., 3.],
                            [1., 0, 1.]])

    assert tx.tensor_equal(gather, expected)
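
For reference, the behaviour the test checks can be reproduced with plain TensorFlow by densifying and gathering the flattened row ids. This is only a naive sketch of what tx.gather_sparse is expected to return (assuming TF 2.x eager execution), not its implementation:

import numpy as np
import tensorflow as tf

v = tf.constant([[1, 0, 1], [0, 0, 2], [3, 0, 3]], tf.float32)
sp = tf.sparse.from_dense(v)
indices = np.array([[0, 1], [2, 0]], dtype=np.int64)

# gather_sparse flattens the ids and returns one row per id; a dense equivalent:
rows = tf.gather(tf.sparse.to_dense(sp), tf.reshape(indices, [-1]))
print(rows.numpy())  # [[1. 0. 1.] [0. 0. 2.] [3. 0. 3.] [1. 0. 1.]]
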
Example #2
embed_dim = 4
batch_size = 2
generator = Generator(k, s)

print([vocab[w] for w in vocab.keys()])
ri_dict = {vocab[word]: generator.generate() for word in vocab.keys()}

tokens = [vocab[w] for w in tokens]
data_it = window_it(tokens, seq_size)
data_it = batch_it(data_it, batch_size)

vocab_tensor = [ri_dict[i] for i in range(len(vocab))]
sp_ri = deepsign.data.transform.ris_to_sp_tensor_value(vocab_tensor, dim=k)

inputs = tx.Input(n_units=2)
ri_inputs = tx.gather_sparse(sp_ri, inputs.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)

embed = tx.Lookup(ri_inputs, seq_size, [k, embed_dim])

# logits: take the embeddings and get the features for all random indexes

ri_layer = tx.TensorLayer(sp_ri, n_units=k)
logits = tx.Linear(input_layer=ri_layer,
                   n_units=embed_dim,
                   shared_weights=embed.weights,
                   bias=True)

single_input = tx.Input(1)
ri_input = tx.TensorLayer(tx.gather_sparse(sp_ri, single_input.tensor), k)
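
The Generator(k, s) used above comes from the surrounding project and is not shown here. As a stand-in, and purely as an assumption about its behaviour, a random index generator typically produces a k-dimensional ternary vector with s non-zero entries (half +1, half -1); a minimal numpy sketch under that assumption:

import numpy as np

def random_index(k, s, rng=None):
    # assumed behaviour: k-dim ternary vector with s non-zero entries (half +1, half -1)
    rng = rng or np.random.default_rng()
    ri = np.zeros(k, dtype=np.float32)
    active = rng.choice(k, size=s, replace=False)  # positions of the non-zero entries
    ri[active[:s // 2]] = 1.0
    ri[active[s // 2:]] = -1.0
    return ri

print(random_index(k=10, s=4))
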
Example #3
def _compute_random_ri_sampled_logits(ri_tensors,
                                      k_dim,
                                      s_active,
                                      weights,
                                      labels,
                                      inputs,
                                      num_sampled,
                                      num_true=1,
                                      subtract_log_q=True,
                                      partition_strategy="mod",
                                      name=None,
                                      seed=None):
    """ Random Random Index Sampled Logits with negative sampling

    https://arxiv.org/pdf/1410.8251.pdf

    Computes the sampled logits from the space of all possible random indexes.
    Since any random index is possible, we sample, not from the existing random indexes
    but from the space of possible random indexes so that the model learns which combinations
    of bases are NOT the ones used to predict a given feature.

    Args:
        ri_tensors: `SparseTensor` with one random index vector per class.
        k_dim: dimension of the random index vectors.
        s_active: number of active (non-zero) entries in each random index.
        weights: weight variable(s) used with `embedding_lookup_sparse`.
        labels: target classes, an int tensor of shape [batch_size, num_true].
        inputs: forward activations of the input network, shape [batch_size, dim].
        num_sampled: number of random indexes to sample as negative examples.
        num_true: number of target classes per training example.
        subtract_log_q: whether to subtract the log expected count of each sample.
        partition_strategy: partitioning strategy passed to `embedding_lookup_sparse`.
        name: optional name scope for the op.
        seed: random seed for the sampling op.

    Returns:
        out_logits, out_labels: tensors of shape [batch_size, num_true + num_sampled],
        with the true logits/labels in the first `num_true` columns, ready to be
        used with a sigmoid (negative sampling) loss.
    """
    if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "random_ri_sampled_logits",
                        weights + [inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        true_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=labels_flat)
        sampled_ris, expected_true_ris, expected_sampled_ris = sample_ri(k_dim, s_active, num_sampled, true_ris)

        all_ris = sparse_ops.sparse_concat(axis=0, sp_inputs=[true_ris, sampled_ris])

        sp_values = all_ris
        sp_indices = tx.sparse_indices(sp_values)

        # Retrieve the weights

        # weights shape is [num_classes, dim]
        all_w = embedding_lookup_sparse(
            weights, sp_indices, sp_values, combiner="sum", partition_strategy=partition_strategy)

        # true_w shape is [batch_size * num_true, dim]
        true_w = array_ops.slice(all_w, [0, 0],
                                 array_ops.stack(
                                     [array_ops.shape(labels_flat)[0], -1]))

        sampled_w = array_ops.slice(
            all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
        # inputs has shape [batch_size, dim]
        # sampled_w has shape [num_sampled, dim]
        # Apply X*W', which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)

        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        row_wise_dots = math_ops.multiply(
            array_ops.expand_dims(inputs, 1),
            array_ops.reshape(true_w, new_true_w_shape))
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that label appears in sampled.
            true_logits -= math_ops.log(expected_true_ris)
            sampled_logits -= math_ops.log(expected_sampled_ris)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, sampled_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(sampled_logits)
        ], 1)

        return out_logits, out_labels
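
The reshape/multiply/_sum_rows sequence above computes, for each example, the dot product between its input vector and each of its num_true target embeddings. A small eager-mode check of that equivalence, with made-up shapes not tied to the function above:

import tensorflow as tf

batch_size, num_true, dim = 4, 2, 3
inputs = tf.random.normal([batch_size, dim])
true_w = tf.random.normal([batch_size * num_true, dim])

# reshape / broadcast-multiply / sum over the feature axis, as in the function above
row_wise_dots = tf.expand_dims(inputs, 1) * tf.reshape(true_w, [-1, num_true, dim])
true_logits = tf.reduce_sum(row_wise_dots, axis=-1)  # [batch_size, num_true]

# the same computation written as a single einsum
ref = tf.einsum('bd,btd->bt', inputs, tf.reshape(true_w, [-1, num_true, dim]))
print(bool(tf.reduce_all(tf.abs(true_logits - ref) < 1e-5)))  # True
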
Example #4
ri_tensor = tf.convert_to_tensor_or_sparse_tensor(ri_tensor)

# *************************************
#   DUMMY INPUT DATA
# *************************************
# batch of word sequence indices
ctx_size = 3
input_data = np.array([[0, 1, 2], [0, 2, 2], [1, 3, 5], [3, 0, 2]])

input_labels = tf.constant(np.array([[3], [1], [10], [25]], dtype=np.int64))
input_labels = tx.TensorLayer(input_labels, n_units=1)

input_layer = tx.TensorLayer(input_data, n_units=3, dtype=tf.int64)

ri_layer = tx.TensorLayer(ri_tensor, k)
ri_inputs = tx.gather_sparse(ri_layer.tensor, input_layer.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)
lookup = tx.Lookup(ri_inputs,
                   ctx_size, [k, embed_size],
                   weight_init=tx.random_normal(0, 0.1),
                   name="lookup")
feature_predict = tx.Linear(lookup, embed_size, bias=True)

all_embeddings = tx.Linear(ri_layer,
                           embed_size,
                           shared_weights=lookup.weights,
                           name="all_features",
                           bias=False)

# dot product of f_predicted . all_embeddings with bias for each target word
run_logits = tx.Linear(feature_predict,
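
The snippet is cut off in the middle of the run_logits call, but the comment above it states what it should compute: the dot product of the predicted feature vector with every row of all_embeddings. Ignoring the bias, that is a matmul against the transposed embedding matrix; the shapes below are made up for illustration:

import tensorflow as tf

vocab_size, embed_size, batch = 8, 4, 2
f_predicted = tf.random.normal([batch, embed_size])           # predicted feature vectors
all_embeddings = tf.random.normal([vocab_size, embed_size])   # one feature vector per word

logits = tf.matmul(f_predicted, all_embeddings, transpose_b=True)
print(logits.shape)  # (2, 8): one logit per target word for each example
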
Example #5
def _compute_sampled_logits(ri_tensors,
                            weights,
                            bias,
                            labels,
                            partition_const,
                            inputs,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None,
                            seed=None):
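    """ Sampled logits computed through Random Index representations.

    Mirrors TensorFlow's `_compute_sampled_logits`, but looks up the class
    weights through the sparse random index vectors in `ri_tensors` for both
    the true labels and the sampled negative classes.

    Returns:
        out_logits, out_labels: tensors of shape [batch_size, num_true + num_sampled].
    """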
    if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "compute_sampled_logits",
                        weights + [inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # Sample the negative labels.
        #   sampled shape: [num_sampled] tensor
        #   true_expected_count shape = [batch_size, 1] tensor
        #   sampled_expected_count shape = [num_sampled] tensor
        if sampled_values is None:
            sampled_values = candidate_sampling_ops.uniform_candidate_sampler(
                true_classes=labels,
                num_true=num_true,
                num_sampled=num_sampled,
                unique=True,
                range_max=num_classes,
                seed=seed)
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = (
            array_ops.stop_gradient(s) for s in sampled_values)
        # pylint: enable=unpacking-non-sequence
        sampled = math_ops.cast(sampled, dtypes.int64)

        # labels_flat is a [batch_size * num_true] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = array_ops.concat([labels_flat, sampled], 0)

        # true_ris
        true_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=labels_flat)
        sampled_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=sampled)

        true_w = embedding_lookup_sparse(params=weights,
                                         sp_ids=tx.sparse_indices(true_ris),
                                         sp_weights=true_ris,
                                         combiner="sum",
                                         partition_strategy=partition_strategy)

        noise_w = embedding_lookup_sparse(params=weights,
                                          sp_ids=tx.sparse_indices(sampled_ris),
                                          sp_weights=sampled_ris,
                                          combiner="sum",
                                          partition_strategy=partition_strategy)

        if bias is not None:
            sampled_b = embedding_lookup_sparse(
                params=bias,
                sp_ids=tx.sparse_indices(sampled_ris),
                sp_weights=sampled_ris,
                combiner="sum",
                partition_strategy=partition_strategy)

            true_b = embedding_lookup_sparse(
                params=bias,
                sp_ids=tx.sparse_indices(true_ris),
                sp_weights=true_ris,
                combiner="sum",
                partition_strategy=partition_strategy)

        noise_logits = math_ops.matmul(inputs, noise_w, transpose_b=True)

        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        true_w_e = array_ops.reshape(true_w, new_true_w_shape)

        row_wise_dots = math_ops.multiply(array_ops.expand_dims(inputs, 1),
                                          true_w_e)
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])

        if bias is not None:
            true_b = array_ops.reshape(true_b, [-1, num_true])
            true_logits += true_b
            noise_logits += sampled_b

        # TODO  need to review how to do this Z
        # true_logits = true_logits * math_ops.exp(partition_const)

        if remove_accidental_hits:
            acc_hits = candidate_sampling_ops.compute_accidental_hits(
                labels, sampled, num_true=num_true)
            acc_indices, acc_ids, acc_weights = acc_hits

            # This is how SparseToDense expects the indices.
            acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
            acc_ids_2d_int32 = array_ops.reshape(
                math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
            sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
                                              "sparse_indices")
            # Create sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(
                [array_ops.shape(labels)[:1],
                 array_ops.expand_dims(num_sampled, 0)], 0)
            if noise_logits.dtype != acc_weights.dtype:
                acc_weights = math_ops.cast(acc_weights, noise_logits.dtype)
            noise_logits += sparse_ops.sparse_to_dense(
                sparse_indices,
                sampled_logits_shape,
                acc_weights,
                default_value=0.0,
                validate_indices=False)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in sampled.
            true_logits -= math_ops.log(true_expected_count)
            noise_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, noise_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(noise_logits)
        ], 1)

        # out_logits = math_ops.div(out_logits,math_ops.exp(partition_const))
        # out_logits = out_logits / (partition_const + 1)
        return out_logits, out_labels
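
The (out_logits, out_labels) pair returned above has the layout used by TensorFlow's sampled-loss helpers, so a negative-sampling loss can be obtained from it with a per-example sum of sigmoid cross-entropies. A usage sketch with made-up shapes:

import tensorflow as tf

batch_size, num_true, num_sampled = 4, 1, 3
out_logits = tf.random.normal([batch_size, num_true + num_sampled])
out_labels = tf.concat([tf.ones([batch_size, num_true]) / num_true,
                        tf.zeros([batch_size, num_sampled])], axis=1)

# one binary classification per column: true columns -> 1, sampled columns -> 0
xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=out_labels, logits=out_logits)
loss = tf.reduce_mean(tf.reduce_sum(xent, axis=1))
print(float(loss))
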
Example #6
def _sampled_logits_from_parametric_noise(ri_tensors,
                                          k_dim,
                                          weights,
                                          labels,
                                          inputs,
                                          input_dim,
                                          num_true=1,
                                          partition_strategy="mod",
                                          name=None):
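    """ Sampled logits with parametric noise.

    Instead of sampling negative random indexes, a small fully-connected
    network maps the true class representations to "noise" random indexes,
    which are then used to look up the negative (noise) weights.
    """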
    if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "compute_sampled_logits",
                        weights + [inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # true_ris
        true_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=labels_flat)

        true_w = embedding_lookup_sparse(params=weights,
                                         sp_ids=tx.sparse_indices(true_ris),
                                         sp_weights=true_ris,
                                         combiner="sum",
                                         partition_strategy=partition_strategy)

        label_layer = tx.TensorLayer(true_w, input_dim)
        noise_fn = tx.FC(label_layer, 512, activation=tx.relu)
        noise_fn_sp = tx.ToSparse(noise_fn)
        noise_ris = tx.Linear(noise_fn_sp, k_dim, weight_init=tx.glorot_uniform(), bias=True)
        noise_ris_sp = tx.ToSparse(noise_ris)

        noise_w = embedding_lookup_sparse(params=weights,
                                          sp_ids=tx.sparse_indices(noise_ris_sp.tensor),
                                          sp_weights=noise_ris_sp.tensor,
                                          combiner="sum",
                                          partition_strategy=partition_strategy)

        noise_logits = math_ops.matmul(inputs, noise_w, transpose_b=True)

        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        true_w_e = array_ops.reshape(true_w, new_true_w_shape)

        row_wise_dots = math_ops.multiply(array_ops.expand_dims(inputs, 1),
                                          true_w_e)
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, noise_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(noise_logits)
        ], 1)

        # out_logits = out_logits * math_ops.exp(partition_const)
        # out_logits = out_logits / (partition_const + 1)
        return out_logits, out_labels
Example #7
num_rows = tf.shape(tf.reshape(indices, [-1]))[-1]
print("num rows \n", num_rows.eval())

column_ids = tf.gather(col_j, row_indices)
print(column_ids.eval())
values = tf.gather_nd(sp.values, row_indices)
# values = tf.reshape(values, [-1])
print("columns \n", column_ids.eval())
print("values \n", values.eval())

row_col = tf.concat([new_rows, column_ids], axis=-1)
print("new indices \n", row_col.eval())

# stack the two scalars along axis 0 so dense_shape is a rank-1 tensor
dense_shape = tf.stack([tf.cast(num_rows, tf.int64), sp.dense_shape[-1]])

print("i ", row_col)
print("v ", values)
print("s ", dense_shape)

gather_sp = tf.SparseTensor(indices=row_col, values=values, dense_shape=dense_shape)

dense = tf.sparse_tensor_to_dense(gather_sp)
print(dense.eval())
print(gather_sp.eval())


gather_sp_tx = tx.gather_sparse(sp, indices)

print(tf.sparse_tensor_to_dense(gather_sp_tx).eval())
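
Several of the names in the fragment above (indices, row_indices, col_j, new_rows, sp) are defined elsewhere, so it does not run on its own. A self-contained sketch of the same manual construction, building the gathered SparseTensor directly from a known sparse input and a list of row ids (a naive reference for the expected behaviour, assuming TF 2.x eager execution, not tx.gather_sparse itself):

import tensorflow as tf

sp = tf.sparse.from_dense(tf.constant([[1., 0., 1.],
                                       [0., 0., 2.],
                                       [3., 0., 3.]]))
ids = tf.constant([0, 2, 0], dtype=tf.int64)

row_of_entry = sp.indices[:, 0]   # row of each stored value
col_of_entry = sp.indices[:, 1]   # column of each stored value

# for every output row j, find the stored entries whose row equals ids[j]
matches = tf.equal(tf.expand_dims(ids, 1), tf.expand_dims(row_of_entry, 0))
out_row, entry = tf.unstack(tf.where(matches), axis=1)

gather_sp = tf.SparseTensor(
    indices=tf.stack([out_row, tf.gather(col_of_entry, entry)], axis=1),
    values=tf.gather(sp.values, entry),
    dense_shape=tf.stack([tf.size(ids, out_type=tf.int64), sp.dense_shape[-1]]))

print(tf.sparse.to_dense(gather_sp).numpy())
# [[1. 0. 1.]
#  [3. 0. 3.]
#  [1. 0. 1.]]
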
Example #8
sampled_values = sampling_ops.uniform_candidate_sampler(
    true_classes=labels,
    num_true=num_true,
    num_sampled=num_samples,
    unique=True,
    range_max=vocab_size,
    seed=None)

sampled, true_expected_count, sampled_expected_count = (
    tf.stop_gradient(s) for s in sampled_values)
sampled = tf.cast(sampled, tf.int64)

all_ids = tf.concat([labels_flat, sampled], 0)

all_ris = tx.gather_sparse(ri_tensor, all_ids)

# Retrieve the true weights and the logits of the sampled weights.

# weights shape is [num_classes, dim]
ri_layer = tx.TensorLayer(ri_tensor, k)
l = tx.Linear(ri_layer, embed_size, weight_init=tx.random_normal(0, 1), bias=True)
weights = l.weights

sp_values = all_ris
sp_indices = tx.sparse_indices(sp_values)

all_w = tf.nn.embedding_lookup_sparse(
    weights, sp_indices, sp_values, combiner="sum")

tf.global_variables_initializer().run()
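
Because all_ids (and therefore all_ris and all_w) concatenates the true labels first and the sampled ids after, the two groups of weights can be recovered with a simple split before computing the sampled logits. A sketch with made-up stand-ins for the tensors built above:

import tensorflow as tf

batch_size, num_samples, embed_size = 4, 3, 5
all_w = tf.random.normal([batch_size + num_samples, embed_size])  # true rows first, sampled rows after
inputs = tf.random.normal([batch_size, embed_size])

true_w = all_w[:batch_size]      # weights looked up for the true labels
sampled_w = all_w[batch_size:]   # weights looked up for the sampled negatives

# one logit per sampled class for every example
sampled_logits = tf.matmul(inputs, sampled_w, transpose_b=True)
print(true_w.shape, sampled_w.shape, sampled_logits.shape)  # (4, 5) (3, 5) (4, 3)
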
Example #9
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 k_dim,
                 ri_tensor: RandomIndexTensor,
                 embed_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 logit_bias=False,
                 use_gate=True,
                 use_hidden=False,
                 h_dim=100,
                 h_activation=tx.elu,
                 h_init=tx.he_normal_init(),
                 h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 use_dropout=True,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5):

        # GRAPH INPUTS
        run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
        loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
        eval_inputs = loss_inputs

        # RUN GRAPH =====================================================
        var_reg = []
        with tf.name_scope("run"):
            # RI ENCODING ===============================================
            # convert ids to RIs: gather a set of random indexes based on the ids in the sequence

            # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim)
            # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
            with tf.name_scope("ri_encode"):
                # used to compute logits
                if isinstance(ri_tensor, RandomIndexTensor):
                    ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(),
                                              k_dim)

                    ri_inputs = ri_tensor.gather(run_inputs.tensor)
                    ri_inputs = ri_inputs.to_sparse_tensor()
                    ri_inputs = tx.TensorLayer(ri_inputs, k_dim)
                else:
                    ri_layer = tx.TensorLayer(ri_tensor, k_dim)
                    ri_inputs = tx.gather_sparse(ri_layer.tensor,
                                                 run_inputs.tensor)
                    ri_inputs = tx.TensorLayer(ri_inputs, k_dim)

            # use those sparse indexes to look up a set of features based on the RI values
            feature_lookup = tx.Lookup(ri_inputs,
                                       ctx_size, [k_dim, embed_dim],
                                       embed_init,
                                       name="lookup")
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()
            # ===========================================================

            if use_gate or use_hidden:
                hl = tx.Linear(feature_lookup,
                               h_dim,
                               h_init,
                               bias=True,
                               name="h_linear")
                ha = tx.Activation(hl, h_activation, name="h_activation")
                h = tx.Compose(hl, ha, name="hidden")
                var_reg.append(hl.weights)

            features = feature_lookup
            if use_gate:
                features = tx.Gate(features, ctx_size, gate_input=h)
                gate = features
                var_reg.append(features.gate_weights)

            x_to_f = tx.Linear(features,
                               embed_dim,
                               x_to_f_init,
                               bias=True,
                               name="x_to_f")
            var_reg.append(x_to_f.weights)
            f_prediction = x_to_f

            if use_hidden:
                h_to_f = tx.Linear(h,
                                   embed_dim,
                                   h_to_f_init,
                                   bias=True,
                                   name="h_to_f")
                var_reg.append(h_to_f.weights)
                f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

            # RI DECODING ===============================================
            shared_weights = feature_lookup.weights if embed_share else None
            logit_init = logit_init if not embed_share else None
            # embedding feature vectors for all words: shape [vocab_size, embed_dim]
            # later, for NCE we don't need to get all the features

            all_embeddings = tx.Linear(ri_layer,
                                       embed_dim,
                                       logit_init,
                                       shared_weights,
                                       name="logits",
                                       bias=False)

            # dot product of f_predicted . all_embeddings with bias for each target word

            run_logits = tx.Linear(f_prediction,
                                   n_units=vocab_size,
                                   shared_weights=all_embeddings.tensor,
                                   transpose_weights=True,
                                   bias=logit_bias)

            if not embed_share:
                var_reg.append(all_embeddings.weights)

            # ===========================================================
            run_embed_prob = tx.Activation(run_logits, tx.softmax)

        # TRAIN GRAPH ===================================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(ri_inputs)
                features = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                features = feature_lookup

            if use_gate or use_hidden:
                if use_dropout:
                    h = h.reuse_with(features)
                    h = tx.Dropout(h, probability=keep_prob)

                if use_gate:
                    features = gate.reuse_with(features, gate_input=h)

                f_prediction = x_to_f.reuse_with(features)

                if use_hidden:
                    h_to_f = h_to_f.reuse_with(h)
                    if use_dropout:
                        h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                    f_prediction = tx.Add(f_prediction, h_to_f)
            else:
                f_prediction = f_prediction.reuse_with(features)

            # all_embeddings was defined above, so its tensor can be shared as the weights of these logits
            train_logits = run_logits.reuse_with(f_prediction)

            train_embed_prob = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="train_output")

            one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                       num_cols=vocab_size)
            train_loss = tx.categorical_cross_entropy(one_hot,
                                                      train_logits.tensor)

            train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # EVAL GRAPH ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # SETUP MODEL CONTAINER ====================================
        super().__init__(run_inputs=run_inputs,
                         run_outputs=run_embed_prob,
                         train_inputs=run_inputs,
                         train_outputs=train_embed_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=run_embed_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
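
The eval graph above scores the run logits against dense one-hot targets with tx.dense_one_hot and tx.categorical_cross_entropy. Assuming tx.categorical_cross_entropy is the usual softmax cross-entropy on logits, the same quantity can be written with plain TensorFlow ops (made-up shapes for illustration):

import tensorflow as tf

vocab_size, batch_size = 10, 4
run_logits = tf.random.normal([batch_size, vocab_size])
targets = tf.constant([3, 1, 7, 0], dtype=tf.int64)  # one target word id per example

one_hot = tf.one_hot(targets, depth=vocab_size)
eval_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=one_hot, logits=run_logits))
print(float(eval_loss))
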