def test_gather_sparse():
    v = tf.constant([[1, 0, 1],
                     [0, 0, 2],
                     [3, 0, 3]], tf.float32)
    sp = tx.to_sparse(v)

    indices = np.array([[0, 1], [2, 0]], dtype=np.int64)
    gather_sp = tx.gather_sparse(sp, indices)
    gather = tf.sparse.to_dense(gather_sp)

    expected = tf.constant([[1., 0., 1.],
                            [0., 0., 2.],
                            [3., 0., 3.],
                            [1., 0., 1.]])
    assert tx.tensor_equal(gather, expected)
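# A minimal companion sketch (an assumption, not part of the original test):
# given the flattening behavior shown above, gather_sparse on the sparse
# tensor should match tf.gather on the dense one for the same row ids.
def test_gather_sparse_matches_dense_gather():
    v = tf.constant([[1, 0, 1], [0, 0, 2], [3, 0, 3]], tf.float32)
    sp = tx.to_sparse(v)
    indices = np.array([[0, 1], [2, 0]], dtype=np.int64)
    gathered_sparse = tf.sparse.to_dense(tx.gather_sparse(sp, indices))
    gathered_dense = tf.gather(v, np.reshape(indices, [-1]))  # rows 0, 1, 2, 0
    assert tx.tensor_equal(gathered_sparse, gathered_dense)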
embed_dim = 4
batch_size = 2

generator = Generator(k, s)
print([vocab[w] for w in vocab.keys()])
ri_dict = {vocab[word]: generator.generate() for word in vocab.keys()}

tokens = [vocab[w] for w in tokens]
data_it = window_it(tokens, seq_size)
data_it = batch_it(data_it, batch_size)

vocab_tensor = [ri_dict[i] for i in range(len(vocab))]
sp_ri = deepsign.data.transform.ris_to_sp_tensor_value(vocab_tensor, dim=k)

inputs = tx.Input(n_units=2)
ri_inputs = tx.gather_sparse(sp_ri, inputs.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)

embed = tx.Lookup(ri_inputs, seq_size, [k, embed_dim])

# logits: take the embeddings and get the features for all random indexes
ri_layer = tx.TensorLayer(sp_ri, n_units=k)
logits = tx.Linear(input_layer=ri_layer,
                   n_units=embed_dim,
                   shared_weights=embed.weights,
                   bias=True)

single_input = tx.Input(1)
ri_input = tx.TensorLayer(tx.gather_sparse(sp_ri, single_input.tensor), k)
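# Assumed iterator semantics for the data pipeline above (illustration only,
# not the deepsign implementation): window_it yields sliding windows of
# seq_size token ids, and batch_it groups consecutive windows into batches.
#   tokens = [0, 1, 2, 3], seq_size = 2  ->  [0, 1], [1, 2], [2, 3]
#   batch_size = 2                       ->  [[0, 1], [1, 2]], [[2, 3]]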
def _compute_random_ri_sampled_logits(ri_tensors,
                                      k_dim,
                                      s_active,
                                      weights,
                                      labels,
                                      inputs,
                                      num_sampled,
                                      num_true=1,
                                      subtract_log_q=True,
                                      partition_strategy="mod",
                                      name=None,
                                      seed=None):
    """Random Index Sampled Logits with negative sampling
    https://arxiv.org/pdf/1410.8251.pdf

    Computes the sampled logits from the space of all possible random indexes.
    Since any random index is possible, we sample not from the existing random
    indexes but from the space of possible random indexes, so that the model
    learns which combinations of bases are NOT the ones used to predict a
    given feature.

    Args:
        ri_tensors: sparse tensor with the random indexes for all classes
        k_dim: dimensionality of the random index space
        s_active: number of active (non-zero) entries in each random index
        weights: feature weights, a tensor (or list of tensors) with shape [k_dim, dim]
        labels: target classes, an int64 tensor with shape [batch_size, num_true]
        inputs: input features, a tensor with shape [batch_size, dim]
        num_sampled: number of random indexes to sample as negative examples
        num_true: number of target classes per example
        subtract_log_q: whether to subtract the log expected count of each pair
        partition_strategy: partitioning strategy for embedding_lookup_sparse
        name: optional name for the op scope
        seed: random seed for sampling

    Returns:
        out_logits: tensor with shape [batch_size, num_true + num_sampled]
        out_labels: tensor with the same shape as out_logits
    """
    if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "random_ri_sampled_logits", weights + [inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        true_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=labels_flat)
        sampled_ris, expected_true_ris, expected_sampled_ris = sample_ri(
            k_dim, s_active, num_sampled, true_ris)
        all_ris = sparse_ops.sparse_concat(axis=0, sp_inputs=[true_ris, sampled_ris])

        sp_values = all_ris
        sp_indices = tx.sparse_indices(sp_values)

        # Retrieve the weights
        # weights shape is [num_classes, dim]
        all_w = embedding_lookup_sparse(
            weights, sp_indices, sp_values, combiner="sum",
            partition_strategy=partition_strategy)

        # true_w shape is [batch_size * num_true, dim]
        true_w = array_ops.slice(all_w, [0, 0],
                                 array_ops.stack([array_ops.shape(labels_flat)[0], -1]))

        sampled_w = array_ops.slice(
            all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])

        # inputs has shape [batch_size, dim]
        # sampled_w has shape [num_sampled, dim]
        # Apply X*W', which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)

        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        row_wise_dots = math_ops.multiply(
            array_ops.expand_dims(inputs, 1),
            array_ops.reshape(true_w, new_true_w_shape))

        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that label appears in sampled.
            true_logits -= math_ops.log(expected_true_ris)
            sampled_logits -= math_ops.log(expected_sampled_ris)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, sampled_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(sampled_logits)
        ], 1)

    return out_logits, out_labels
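# _sum_rows is used above but not defined in this snippet; a minimal sketch,
# matching the helper of the same name in tensorflow.python.ops.nn_impl:
# summing each row via matmul with a ones vector has a cheaper gradient than
# reduce_sum(x, 1), which matters since this runs inside the training loss.
def _sum_rows(x):
    cols = array_ops.shape(x)[1]
    ones_shape = array_ops.stack([cols, 1])
    ones = array_ops.ones(ones_shape, x.dtype)
    return array_ops.reshape(math_ops.matmul(x, ones), [-1])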
ri_tensor = tf.convert_to_tensor_or_sparse_tensor(ri_tensor)

# *************************************
#   DUMMY INPUT DATA
# *************************************
# batch of word sequence indices
ctx_size = 3
input_data = np.array([[0, 1, 2], [0, 2, 2], [1, 3, 5], [3, 0, 2]])
input_labels = tf.constant(np.array([[3], [1], [10], [25]], dtype=np.int64))
input_labels = tx.TensorLayer(input_labels, n_units=1)

input_layer = tx.TensorLayer(input_data, n_units=3, dtype=tf.int64)
ri_layer = tx.TensorLayer(ri_tensor, k)
ri_inputs = tx.gather_sparse(ri_layer.tensor, input_layer.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)
lookup = tx.Lookup(ri_inputs, ctx_size, [k, embed_size],
                   weight_init=tx.random_normal(0, 0.1),
                   name="lookup")

feature_predict = tx.Linear(lookup, embed_size, bias=True)

all_embeddings = tx.Linear(ri_layer, embed_size,
                           shared_weights=lookup.weights,
                           name="all_features",
                           bias=False)

# dot product of f_predicted . all_embeddings with bias for each target word
# (arguments below are an assumed completion, mirroring run_logits in the
# model class at the end of this section)
run_logits = tx.Linear(feature_predict,
                       n_units=vocab_size,
                       shared_weights=all_embeddings.tensor,
                       transpose_weights=True,
                       bias=True)
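# A minimal continuation sketch, reusing the wiring of the model class below
# (assumes vocab_size is defined): softmax output plus one-hot cross-entropy.
run_prob = tx.Activation(run_logits, tx.softmax)
one_hot = tx.dense_one_hot(column_indices=input_labels.tensor, num_cols=vocab_size)
loss = tf.reduce_mean(tx.categorical_cross_entropy(one_hot, run_logits.tensor))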
def _compute_sampled_logits(ri_tensors,
                            weights,
                            bias,
                            labels,
                            partition_const,
                            inputs,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None,
                            seed=None):
    if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "compute_sampled_logits", weights + [inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # Sample the negative labels.
        # sampled shape: [num_sampled] tensor
        # true_expected_count shape = [batch_size, 1] tensor
        # sampled_expected_count shape = [num_sampled] tensor
        if sampled_values is None:
            sampled_values = candidate_sampling_ops.uniform_candidate_sampler(
                true_classes=labels,
                num_true=num_true,
                num_sampled=num_sampled,
                unique=True,
                range_max=num_classes,
                seed=seed)
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = (
            array_ops.stop_gradient(s) for s in sampled_values)
        # pylint: enable=unpacking-non-sequence
        sampled = math_ops.cast(sampled, dtypes.int64)

        # labels_flat is a [batch_size * num_true] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = array_ops.concat([labels_flat, sampled], 0)

        # true_ris
        true_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=labels_flat)
        sampled_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=sampled)

        true_w = embedding_lookup_sparse(params=weights,
                                         sp_ids=tx.sparse_indices(true_ris),
                                         sp_weights=true_ris,
                                         combiner="sum",
                                         partition_strategy=partition_strategy)

        noise_w = embedding_lookup_sparse(params=weights,
                                          sp_ids=tx.sparse_indices(sampled_ris),
                                          sp_weights=sampled_ris,
                                          combiner="sum",
                                          partition_strategy=partition_strategy)

        if bias is not None:
            sampled_b = embedding_lookup_sparse(
                params=bias,
                sp_ids=tx.sparse_indices(sampled_ris),
                sp_weights=sampled_ris,
                combiner="sum",
                partition_strategy=partition_strategy)

            true_b = embedding_lookup_sparse(
                params=bias,
                sp_ids=tx.sparse_indices(true_ris),
                sp_weights=true_ris,
                combiner="sum",
                partition_strategy=partition_strategy)

        noise_logits = math_ops.matmul(inputs, noise_w, transpose_b=True)

        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        true_w_e = array_ops.reshape(true_w, new_true_w_shape)
        row_wise_dots = math_ops.multiply(array_ops.expand_dims(inputs, 1), true_w_e)

        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])

        if bias is not None:
            true_b = array_ops.reshape(true_b, [-1, num_true])
            true_logits += true_b
            noise_logits += sampled_b

        # TODO need to review how to do this Z
        # true_logits = true_logits * math_ops.exp(partition_const)

        if remove_accidental_hits:
            acc_hits = candidate_sampling_ops.compute_accidental_hits(
                labels, sampled, num_true=num_true)
            acc_indices, acc_ids, acc_weights = acc_hits

            # This is how SparseToDense expects the indices.
            acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
            acc_ids_2d_int32 = array_ops.reshape(
                math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
            sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32],
                                              1, "sparse_indices")
            # Create sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(
                [array_ops.shape(labels)[:1],
                 array_ops.expand_dims(num_sampled, 0)], 0)
            if noise_logits.dtype != acc_weights.dtype:
                acc_weights = math_ops.cast(acc_weights, noise_logits.dtype)
            noise_logits += sparse_ops.sparse_to_dense(
                sparse_indices,
                sampled_logits_shape,
                acc_weights,
                default_value=0.0,
                validate_indices=False)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in sampled.
            true_logits -= math_ops.log(true_expected_count)
            noise_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, noise_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(noise_logits)
        ], 1)

        # out_logits = math_ops.div(out_logits, math_ops.exp(partition_const))
        # out_logits = out_logits / (partition_const + 1)

    return out_logits, out_labels
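# A hedged usage sketch (assumed surrounding variables: ri_tensor, weights,
# input_labels, inputs, vocab_size): the (logits, labels) pair returned above
# is consumed the same way TensorFlow's sampled_softmax_loss consumes its own.
logits, labels = _compute_sampled_logits(
    ri_tensors=ri_tensor, weights=weights, bias=None, labels=input_labels,
    partition_const=0.0, inputs=inputs, num_sampled=10, num_classes=vocab_size)
sampled_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))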
def _sampled_logits_from_parametric_noise(ri_tensors,
                                          k_dim,
                                          weights,
                                          labels,
                                          inputs,
                                          input_dim,
                                          num_true=1,
                                          partition_strategy="mod",
                                          name=None):
    if isinstance(weights, variables.PartitionedVariable):
        weights = list(weights)
    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "compute_sampled_logits", weights + [inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # true_ris
        true_ris = tx.gather_sparse(sp_tensor=ri_tensors, ids=labels_flat)

        true_w = embedding_lookup_sparse(params=weights,
                                         sp_ids=tx.sparse_indices(true_ris),
                                         sp_weights=true_ris,
                                         combiner="sum",
                                         partition_strategy=partition_strategy)

        label_layer = tx.TensorLayer(true_w, input_dim)
        noise_fn = tx.FC(label_layer, 512, activation=tx.relu)
        noise_fn_sp = tx.ToSparse(noise_fn)
        noise_ris = tx.Linear(noise_fn_sp, k_dim, weight_init=tx.glorot_uniform(), bias=True)
        noise_ris_sp = tx.ToSparse(noise_ris)

        noise_w = embedding_lookup_sparse(params=weights,
                                          sp_ids=tx.sparse_indices(noise_ris_sp.tensor),
                                          sp_weights=noise_ris_sp.tensor,
                                          combiner="sum",
                                          partition_strategy=partition_strategy)

        noise_logits = math_ops.matmul(inputs, noise_w, transpose_b=True)

        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
        true_w_e = array_ops.reshape(true_w, new_true_w_shape)
        row_wise_dots = math_ops.multiply(array_ops.expand_dims(inputs, 1), true_w_e)

        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat([[-1], dim], 0))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat([true_logits, noise_logits], 1)

        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat([
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(noise_logits)
        ], 1)

        # out_logits = out_logits * math_ops.exp(partition_const)
        # out_logits = out_logits / (partition_const + 1)

    return out_logits, out_labels
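# Shape trace for the parametric noise path above (assumed shapes, with
# batch_size = B, feature dim = D = input_dim, and k_dim = K):
#   true_w       : [B, D]   feature vectors of the true labels
#   noise_fn     : [B, 512] relu fully-connected layer over true_w
#   noise_ris    : [B, K]   generated (parametric) random index per example
#   noise_w      : [B, D]   features looked up with the generated ris
#   noise_logits : [B, B]   inputs . noise_w^T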
num_rows = tf.shape(tf.reshape(indices, [-1]))[-1]
print("num rows \n", num_rows.eval())

column_ids = tf.gather(col_j, row_indices)
print(column_ids.eval())

values = tf.gather_nd(sp.values, row_indices)
# values = tf.reshape(values, [-1])
print("columns \n", column_ids.eval())
print("values \n", values.eval())

row_col = tf.concat([new_rows, column_ids], axis=-1)
print("new indices \n", row_col.eval())

# stack must be 0 to concat into rank 1 tensors
dense_shape = tf.stack([tf.cast(num_rows, tf.int64), sp.dense_shape[-1]])

print("i ", row_col)
print("v ", values)
print("s ", dense_shape)
gather_sp = tf.SparseTensor(indices=row_col, values=values, dense_shape=dense_shape)
dense = tf.sparse_tensor_to_dense(gather_sp)
print(dense.eval())
print(gather_sp.eval())

gather_sp_tx = tx.gather_sparse(sp, indices)
print(tf.sparse_tensor_to_dense(gather_sp_tx).eval())
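# The snippet above relies on free variables; a minimal setup sketch that is
# consistent with it (assumptions for illustration, not necessarily the
# actual tx.gather_sparse internals):
sess = tf.InteractiveSession()  # the .eval() calls assume an active session
v = tf.constant([[1, 0, 1], [0, 0, 2], [3, 0, 3]], tf.float32)
sp = tx.to_sparse(v)
indices = np.array([[0, 1], [2, 0]], dtype=np.int64)
flat_ids = tf.constant(np.reshape(indices, [-1]))  # rows to gather: 0, 1, 2, 0
entry_rows = sp.indices[:, 0]                      # row of each sparse entry
col_j = sp.indices[:, -1]                          # column of each sparse entry
# match every gathered id against every entry row; tf.where returns the
# matches in row-major order, i.e. grouped by gathered id
match = tf.where(tf.equal(tf.expand_dims(flat_ids, 1),
                          tf.expand_dims(entry_rows, 0)))
new_rows = match[:, :1]      # output row for each matched entry
row_indices = match[:, 1:]   # position of the entry in sp.values / sp.indices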
sampled_values = sampling_ops.uniform_candidate_sampler(
    true_classes=labels,
    num_true=num_true,
    num_sampled=num_samples,
    unique=True,
    range_max=vocab_size,
    seed=None)

sampled, true_expected_count, sampled_expected_count = (
    tf.stop_gradient(s) for s in sampled_values)
sampled = tf.cast(sampled, tf.int64)

all_ids = tf.concat([labels_flat, sampled], 0)
all_ris = tx.gather_sparse(ri_tensor, all_ids)

# Retrieve the true weights and the logits of the sampled weights.
# weights shape is [num_classes, dim]
ri_layer = tx.TensorLayer(ri_tensor, k)
l = tx.Linear(ri_layer, embed_size, weight_init=tx.random_normal(0, 1), bias=True)
weights = l.weights

sp_values = all_ris
sp_indices = tx.sparse_indices(sp_values)

all_w = tf.nn.embedding_lookup_sparse(
    weights, sp_indices, sp_values, combiner="sum")

tf.global_variables_initializer().run()
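# A minimal continuation sketch, mirroring the slicing used in
# _compute_random_ri_sampled_logits above: split all_w back into the rows
# for the true labels and the rows for the sampled negatives.
batch_true = tf.shape(labels_flat)[0]
true_w = all_w[:batch_true]     # [batch_size * num_true, embed_size]
sampled_w = all_w[batch_true:]  # [num_samples, embed_size]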
def __init__(self,
             ctx_size,
             vocab_size,
             k_dim,
             ri_tensor: RandomIndexTensor,
             embed_dim,
             embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             embed_share=True,
             logit_bias=False,
             use_gate=True,
             use_hidden=False,
             h_dim=100,
             h_activation=tx.elu,
             h_init=tx.he_normal_init(),
             h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             use_dropout=True,
             embed_dropout=False,
             keep_prob=0.95,
             l2_loss=False,
             l2_loss_coef=1e-5):

    # GRAPH INPUTS
    run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
    loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
    eval_inputs = loss_inputs

    # RUN GRAPH =====================================================
    var_reg = []
    with tf.name_scope("run"):
        # RI ENCODING ===============================================
        # convert ids to ris: gather a set of random indexes based on the ids in a sequence
        # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim)
        # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
        with tf.name_scope("ri_encode"):
            # used to compute logits
            if isinstance(ri_tensor, RandomIndexTensor):
                ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(), k_dim)

                ri_inputs = ri_tensor.gather(run_inputs.tensor)
                ri_inputs = ri_inputs.to_sparse_tensor()
                ri_inputs = tx.TensorLayer(ri_inputs, k_dim)
            else:
                ri_layer = tx.TensorLayer(ri_tensor, k_dim)
                ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
                ri_inputs = tx.TensorLayer(ri_inputs, k_dim)

        # use those sparse indexes to lookup a set of features based on the ri values
        feature_lookup = tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim],
                                   embed_init, name="lookup")
        var_reg.append(feature_lookup.weights)
        feature_lookup = feature_lookup.as_concat()
        # ===========================================================

        if use_gate or use_hidden:
            hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear")
            ha = tx.Activation(hl, h_activation, name="h_activation")
            h = tx.Compose(hl, ha, name="hidden")
            var_reg.append(hl.weights)

        features = feature_lookup
        if use_gate:
            features = tx.Gate(features, ctx_size, gate_input=h)
            gate = features
            var_reg.append(features.gate_weights)

        x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f")
        var_reg.append(x_to_f.weights)
        f_prediction = x_to_f

        if use_hidden:
            h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f")
            var_reg.append(h_to_f.weights)
            f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

        # RI DECODING ===============================================
        shared_weights = feature_lookup.weights if embed_share else None
        logit_init = logit_init if not embed_share else None
        # embedding feature vectors for all words: shape [vocab_size, embed_dim]
        # later, for NCE we don't need to get all the features
        all_embeddings = tx.Linear(ri_layer, embed_dim, logit_init,
                                   shared_weights, name="logits", bias=False)

        # dot product of f_predicted . all_embeddings with bias for each target word
        run_logits = tx.Linear(f_prediction,
                               n_units=vocab_size,
                               shared_weights=all_embeddings.tensor,
                               transpose_weights=True,
                               bias=logit_bias)

        if not embed_share:
            var_reg.append(all_embeddings.weights)
        # ===========================================================

        run_embed_prob = tx.Activation(run_logits, tx.softmax)

    # TRAIN GRAPH ===================================================
    with tf.name_scope("train"):
        if use_dropout and embed_dropout:
            feature_lookup = feature_lookup.reuse_with(ri_inputs)
            features = tx.Dropout(feature_lookup, probability=keep_prob)
        else:
            features = feature_lookup

        if use_gate or use_hidden:
            if use_dropout:
                h = h.reuse_with(features)
                h = tx.Dropout(h, probability=keep_prob)

            if use_gate:
                features = gate.reuse_with(features, gate_input=h)

            f_prediction = x_to_f.reuse_with(features)

            if use_hidden:
                h_to_f = h_to_f.reuse_with(h)
                if use_dropout:
                    h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                f_prediction = tx.Add(f_prediction, h_to_f)
        else:
            f_prediction = f_prediction.reuse_with(features)

        # all_embeddings, from which these logits are computed, is already
        # defined above, so reusing run_logits here should be ok
        train_logits = run_logits.reuse_with(f_prediction)

        train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output")

        one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size)
        train_loss = tx.categorical_cross_entropy(one_hot, train_logits.tensor)
        train_loss = tf.reduce_mean(train_loss)

        if l2_loss:
            losses = [tf.nn.l2_loss(var) for var in var_reg]
            train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

    # EVAL GRAPH ====================================================
    with tf.name_scope("eval"):
        one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size)
        eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
        eval_loss = tf.reduce_mean(eval_loss)

    # SETUP MODEL CONTAINER =========================================
    super().__init__(run_inputs=run_inputs, run_outputs=run_embed_prob,
                     train_inputs=run_inputs, train_outputs=train_embed_prob,
                     eval_inputs=run_inputs, eval_outputs=run_embed_prob,
                     train_out_loss=train_loss, train_in_loss=loss_inputs,
                     eval_out_score=eval_loss, eval_in_score=eval_inputs)
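# A hedged usage sketch; the enclosing class name is not shown above, so
# `NNSE` below is a hypothetical stand-in for whatever class defines this
# __init__ (its superclass is the model container invoked via super()).
model = NNSE(ctx_size=3,
             vocab_size=10000,
             k_dim=1000,
             ri_tensor=ri_tensor,  # assumed RandomIndexTensor for the vocabulary
             embed_dim=64)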