def _apply_sparse(self, cache):
  """Computes the sparse update s_t from deduplicated gradient slices."""
  x_tm1, g_t, idxs = cache['x_tm1'], cache['g_t'], cache['idxs']
  idxs, idxs_ = tf.unique(idxs)
  g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs))
  updates = cache['updates']

  if self.mu > 0:
    m_t, t_m = self._sparse_moving_average(x_tm1, idxs, g_t_, 'm', beta=self.mu)
    m_t_ = tf.gather(m_t, idxs)
    m_bar_t_ = (1 - self.gamma) * m_t_ + self.gamma * g_t_
    updates.extend([m_t, t_m])
  else:
    m_bar_t_ = g_t_

  if self.nu > 0:
    v_t, t_v = self._sparse_moving_average(x_tm1, idxs, g_t_**2, 'v', beta=self.nu)
    v_t_ = tf.gather(v_t, idxs)
    v_bar_t_ = tf.sqrt(v_t_ + self.epsilon)
    updates.extend([v_t, t_v])
  else:
    v_bar_t_ = 1

  s_t_ = self.learning_rate * m_bar_t_ / v_bar_t_

  cache['s_t'] = s_t_
  cache['g_t'] = g_t_
  cache['idxs'] = idxs

  return cache
def _grad_variance(self):
  """Estimate of gradient variance.

  Returns:
    C_t ops.
  """
  grad_var_ops = []
  tensor_to_avg = []
  for t, g in zip(self._vars, self._grad):
    if isinstance(g, tf.IndexedSlices):
      tensor_to_avg.append(
          tf.reshape(
              tf.unsorted_segment_sum(g.values, g.indices, g.dense_shape[0]),
              shape=t.get_shape()))
    else:
      tensor_to_avg.append(g)
  avg_op = self._moving_averager.apply(tensor_to_avg)
  grad_var_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._grad_avg = [
        self._moving_averager.average(val) for val in tensor_to_avg
    ]
    self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]

  # Compute variance as E[||g||^2] - ||E[g]||^2, clamped away from zero.
  self._grad_var = tf.maximum(
      tf.constant(1e-6, dtype=self._grad_norm_squared_avg.dtype),
      self._grad_norm_squared_avg -
      tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
  if self._sparsity_debias:
    self._grad_var *= self._sparsity_avg
  return grad_var_ops  # C_t
def _apply_sparse(self, cache):
  """Computes the sparse SGD update s_t from deduplicated gradient slices."""
  g_t, idxs = cache['g_t'], cache['idxs']
  idxs, idxs_ = tf.unique(idxs)
  g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs))

  cache['g_t'] = g_t_
  cache['idxs'] = idxs
  cache['s_t'] = self.learning_rate * g_t_

  return cache
def accumulate_sparse_gradients(grad):
  """Accumulates repeated indices of a sparse gradient update.

  Args:
    grad: a tf.IndexedSlices gradient

  Returns:
    grad_indices: unique indices
    grad_values: gradient values corresponding to the indices
  """
  grad_indices, grad_segments = tf.unique(grad.indices)
  grad_values = tf.unsorted_segment_sum(grad.values, grad_segments,
                                        tf.size(grad_indices))
  return grad_indices, grad_values
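# Hedged usage sketch (not part of the original source): shows how rows of a
# tf.IndexedSlices gradient that share an index are summed into a single row.
def _demo_accumulate_sparse_gradients():
  grad = tf.IndexedSlices(
      values=tf.constant([[1., 1.], [2., 2.], [3., 3.]]),
      indices=tf.constant([0, 2, 0]),
      dense_shape=tf.constant([4, 2]))
  grad_indices, grad_values = accumulate_sparse_gradients(grad)
  # grad_indices -> [0, 2]; grad_values -> [[4., 4.], [2., 2.]]
  return grad_indices, grad_values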
def index_softmax(values: tf.Tensor, indices: tf.Tensor,
                  n_indices: int) -> tf.Tensor:
    """Compute multiple softmax() in groups defined by indices.

    E.g. index_softmax([0, 0, ln(2), 2], [0, 0, 0, 1], 2) computes
    softmax([0, 0, ln(2)]) and softmax([2])
      => [0.25, 0.25, 0.5, 1.0]

    Acts over axis=0 of values.
    """
    # Run everything in float32, for stability.
    dtype = values.dtype
    values = tf.cast(values, tf.float32)
    max_values = tf.reduce_max(values, axis=0, keepdims=True)
    exp_values = tf.exp(values - max_values)
    # Max(*, 1e-6) prevents a DIV0 error, caused by underflow of the sum-exp.
    sum_exp_values = tf.maximum(
        tf.unsorted_segment_sum(exp_values, indices, n_indices), 1e-6)
    return tf.cast(exp_values / tf.gather(sum_exp_values, indices), dtype)
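# Hedged usage sketch (not part of the original source): reproduces the
# docstring example above with concrete tensors. Assumes eager execution so
# the result can be inspected directly.
def _demo_index_softmax():
    import math
    values = tf.constant([0.0, 0.0, math.log(2.0), 2.0])
    indices = tf.constant([0, 0, 0, 1])
    # Two groups: softmax([0, 0, ln(2)]) and softmax([2]).
    return index_softmax(values, indices, n_indices=2)
    # -> [0.25, 0.25, 0.5, 1.0]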
def transformer_conv(
        n_output: int,
        n_heads: int,
        dropout: float,
        nodes: tf.Tensor,
        edge_idx: tf.Tensor,
        edges: tf.Tensor,
) -> tf.Tensor:
    """Implementation of Graph Transformer, https://arxiv.org/abs/2009.03509.

    Matches the specification of TransformerConv in PyTorch Geometric, always
    using a "skip" projection from inputs and shared key/value projections for
    edges.

    Arguments:
    n_output -- output feature size
    n_heads -- number of attention heads (note: head size is given by
        n_output/n_heads)
    dropout -- rate parameter for attention mask (post-softmax) dropout
    nodes -- shape (n_nodes, node_feature_size), input features for each node
    edge_idx -- shape (2, n_edges), (0 <= edge_idx < n_nodes), the source and
        destination of each edge, indexing into nodes
    edges -- shape (n_edges, edge_feature_size), input features for each edge

    Returns:
    tensor of shape (n_nodes, n_output), node features after applying a graph
    transformer (attention) layer
    """
    assert n_output % n_heads == 0, \
        "graph transformer output size should be divisible by the number of heads"
    head_size = n_output // n_heads
    n_nodes, _ = assert_shape(nodes, (None, None))
    _, n_edges = assert_shape(edge_idx, (2, None))
    assert_shape(edges, (n_edges, None))

    with tf.variable_scope("skip"):
        skip = linear(nodes, n_output)

    with tf.variable_scope("edge_shared_kv"):
        edge_kv = linear(edges, n_output, use_bias=False)

    with tf.variable_scope("node_qkv"):
        node_qkv = linear(nodes, 3 * n_output)

    with tf.variable_scope("attention"):
        q = tf.gather(node_qkv[:, :n_output], edge_idx[1])
        kv = tf.reshape(
            tf.gather(node_qkv[:, n_output:], edge_idx[0]),
            (n_edges, 2, n_output),
        )
        k, v = tf.unstack(kv + edge_kv[:, tf.newaxis, :], axis=1)
        a = tf.reduce_sum(
            tf.reshape(q * k, (n_edges, n_heads, head_size)),
            -1) / (head_size ** 0.5)
        a = index_softmax(a, edge_idx[1], n_nodes)
        if dropout:
            a = tf.nn.dropout(a, rate=dropout)
        attention = tf.unsorted_segment_sum(
            tf.repeat(a, head_size, axis=1) * v, edge_idx[1], n_nodes)

    return skip + attention
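# Hedged usage sketch (not part of the original source): applies the layer to
# a toy 3-node, 2-edge graph. Assumes the linear() and assert_shape() helpers
# used above are available and TF1-style graph mode (tf.variable_scope).
def _demo_transformer_conv():
    nodes = tf.random.normal((3, 8))          # 3 nodes, 8 features each
    edge_idx = tf.constant([[0, 1],           # edge sources
                            [1, 2]])          # edge destinations
    edges = tf.random.normal((2, 4))          # 2 edges, 4 features each
    with tf.variable_scope("demo_transformer_conv"):
        # Returns a (3, 16) tensor of updated node features.
        return transformer_conv(
            n_output=16, n_heads=4, dropout=0.1,
            nodes=nodes, edge_idx=edge_idx, edges=edges)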
def data_group_avg(assignments, data):
  """Averages the data points assigned to each of the 3 groups."""
  # Sum the data values within each group (number of groups is hard-coded to 3).
  sum_total = tf.unsorted_segment_sum(data, assignments, 3)
  # Count the number of points in each group.
  num_total = tf.unsorted_segment_sum(tf.ones_like(data), assignments, 3)
  avg_by_group = sum_total / num_total
  return avg_by_group
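# Hedged usage sketch (not part of the original source): averages 2-D points
# per cluster assignment; note the number of groups is hard-coded to 3 above.
def _demo_data_group_avg():
  data = tf.constant([[0., 0.], [2., 2.], [4., 4.], [6., 6.]])
  assignments = tf.constant([0, 0, 1, 2])
  return data_group_avg(assignments, data)
  # -> [[1., 1.], [4., 4.], [6., 6.]]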
def follow_mention(batch_entities,
                   relation_st_qry,
                   relation_en_qry,
                   entity_word_ids,
                   entity_word_masks,
                   ent2ment_ind,
                   ent2ment_val,
                   ment2ent_map,
                   word_emb_table,
                   word_weights,
                   mips_search_fn,
                   tf_db,
                   hidden_size,
                   mips_config,
                   qa_config,
                   is_training,
                   ensure_index=None):
  """Sparse implementation of the relation follow operation.

  Args:
    batch_entities: [batch_size, num_entities] SparseTensor of incoming
      entities and their scores.
    relation_st_qry: [batch_size, dim] Tensor representing start query vectors
      for dense retrieval.
    relation_en_qry: [batch_size, dim] Tensor representing end query vectors
      for dense retrieval.
    entity_word_ids: [num_entities, max_entity_len] Tensor holding word ids of
      each entity.
    entity_word_masks: [num_entities, max_entity_len] Tensor with masks into
      word ids above.
    ent2ment_ind: [num_entities, num_mentions] RaggedTensor mapping entities
      to mention indices which co-occur with them.
    ent2ment_val: [num_entities, num_mentions] RaggedTensor mapping entities
      to mention scores which co-occur with them.
    ment2ent_map: [num_mentions] Tensor mapping mentions to their entities.
    word_emb_table: [vocab_size, dim] Tensor of word embeddings.
    word_weights: [vocab_size, 1] Tensor of word weights.
    mips_search_fn: Function which accepts a dense query vector and returns
      the top-k indices closest to it (from the tf_db).
    tf_db: [num_mentions, 2 * dim] Tensor of mention representations.
    hidden_size: Scalar dimension of word embeddings.
    mips_config: MIPSConfig object.
    qa_config: QAConfig object.
    is_training: Boolean.
    ensure_index: [batch_size] Tensor of mention ids. Only needed if
      `is_training` is True.

  Returns:
    ret_entity_vec: [batch_size, num_entities] SparseTensor of retrieved
      entity scores.
    ret_mention_vec: [batch_size, num_mentions] SparseTensor of retrieved
      mention scores.
    dense_mention_vec: [batch_size, num_mentions] SparseTensor of mention
      scores from dense retrieval only.
    sp_mention_vec: [batch_size, num_mentions] SparseTensor of mention scores
      from sparse retrieval only.
  """
  if qa_config.entity_score_threshold is not None:
    # Remove the entities which have scores lower than the threshold.
    mask = tf.greater(batch_entities.values, qa_config.entity_score_threshold)
    batch_entities = tf.sparse.retain(batch_entities, mask)
  batch_size = batch_entities.dense_shape[0]  # number of examples in the batch
  batch_ind = batch_entities.indices[:, 0]  # the list of the batch ids
  entity_ind = batch_entities.indices[:, 1]  # the list of the entity ids
  entity_scs = batch_entities.values  # the list of the scores of each entity

  # Obtain BOW embeddings for the given set of entities.
  # [NNZ, dim] NNZ (number of non-zero entries) = len(entity_ind)
  batch_entity_emb = model_utils.entity_emb(entity_ind, entity_word_ids,
                                            entity_word_masks, word_emb_table,
                                            word_weights)
  batch_entity_emb = batch_entity_emb * tf.expand_dims(entity_scs, axis=1)
  # [batch_size, dim]
  uniq_batch_ind, uniq_idx = tf.unique(batch_ind)
  agg_emb = tf.unsorted_segment_sum(batch_entity_emb, uniq_idx,
                                    tf.shape(uniq_batch_ind)[0])
  batch_bow_emb = tf.scatter_nd(
      tf.expand_dims(uniq_batch_ind, 1), agg_emb,
      tf.stack([batch_size, hidden_size], axis=0))
  batch_bow_emb.set_shape([None, hidden_size])
  if qa_config.projection_dim is not None:
    with tf.variable_scope("projection"):
      batch_bow_emb = contrib_layers.fully_connected(
          batch_bow_emb,
          qa_config.projection_dim,
          activation_fn=tf.nn.tanh,
          reuse=tf.AUTO_REUSE,
          scope="bow_projection")
  # Each instance in a batch has only one vector as its embedding.

  # Ragged sparse search.
  # (num_batch x num_entities) * (num_entities x num_mentions)
  # [batch_size x num_mentions] sparse
  sp_mention_vec = model_utils.sparse_ragged_mul(
      batch_entities,
      ent2ment_ind,
      ent2ment_val,
      batch_size,
      mips_config.num_mentions,
      qa_config.sparse_reduce_fn,  # max or sum
      threshold=qa_config.entity_score_threshold,
      fix_values_to_one=qa_config.fix_sparse_to_one)
  if is_training and qa_config.ensure_answer_sparse:
    ensure_indices = tf.stack([tf.range(batch_size), ensure_index], axis=-1)
    sp_ensure_vec = tf.SparseTensor(
        tf.cast(ensure_indices, tf.int64),
        tf.ones([batch_size]),
        dense_shape=[batch_size, mips_config.num_mentions])
    sp_mention_vec = tf.sparse.add(sp_mention_vec, sp_ensure_vec)
    sp_mention_vec = tf.SparseTensor(
        indices=sp_mention_vec.indices,
        values=tf.minimum(1., sp_mention_vec.values),
        dense_shape=sp_mention_vec.dense_shape)

  # Dense scam search.
  # [batch_size, 2 * dim]
  # Construct query embeddings (dual encoder: [subject; relation]).
  scam_qrys = tf.concat(
      [batch_bow_emb + relation_st_qry, batch_bow_emb + relation_en_qry],
      axis=1)
  with tf.device("/cpu:0"):
    # [batch_size, num_neighbors]
    _, ret_mention_ids = mips_search_fn(scam_qrys)
    if is_training and qa_config.ensure_answer_dense:
      ret_mention_ids = model_utils.ensure_values_in_mat(
          ret_mention_ids, ensure_index, tf.int32)
    # [batch_size, num_neighbors, 2 * dim]
    ret_mention_emb = tf.gather(tf_db, ret_mention_ids)

  if qa_config.l2_normalize_db:
    ret_mention_emb = tf.nn.l2_normalize(ret_mention_emb, axis=2)
  # [batch_size, 1, num_neighbors]
  ret_mention_scs = tf.matmul(
      tf.expand_dims(scam_qrys, 1), ret_mention_emb, transpose_b=True)
  # [batch_size, num_neighbors]
  ret_mention_scs = tf.squeeze(ret_mention_scs, 1)
  # [batch_size, num_mentions] sparse
  dense_mention_vec = model_utils.convert_search_to_vector(
      ret_mention_scs, ret_mention_ids, tf.cast(batch_size, tf.int32),
      mips_config.num_neighbors, mips_config.num_mentions)

  # Combine sparse and dense search.
  if (is_training and qa_config.train_with_sparse) or (
      (not is_training) and qa_config.predict_with_sparse):
    # [batch_size, num_mentions] sparse
    if qa_config.sparse_strategy == "dense_first":
      ret_mention_vec = model_utils.sp_sp_matmul(dense_mention_vec,
                                                 sp_mention_vec)
    elif qa_config.sparse_strategy == "sparse_first":
      with tf.device("/cpu:0"):
        ret_mention_vec = model_utils.rescore_sparse(sp_mention_vec, tf_db,
                                                     scam_qrys)
    else:
      raise ValueError("Unrecognized sparse_strategy %s" %
                       qa_config.sparse_strategy)
  else:
    # [batch_size, num_mentions] sparse
    ret_mention_vec = dense_mention_vec

  # Get entity scores and ids.
  # [batch_size, num_entities] sparse
  entity_indices = tf.cast(
      tf.gather(ment2ent_map, ret_mention_vec.indices[:, 1]), tf.int64)
  ret_entity_vec = tf.SparseTensor(
      indices=tf.concat(
          [ret_mention_vec.indices[:, 0:1],
           tf.expand_dims(entity_indices, 1)],
          axis=1),
      values=ret_mention_vec.values,
      dense_shape=[batch_size, qa_config.num_entities])

  return ret_entity_vec, ret_mention_vec, dense_mention_vec, sp_mention_vec
def random_spans_noise_mask(length=200,
                            noise_density=0.15,
                            mean_noise_span_length=3.0):
  """Noise mask consisting of random spans of noise tokens.

  The number of noise tokens and the number of noise spans and non-noise spans
  are determined deterministically as follows:
    num_noise_tokens = round(length * noise_density)
    num_nonnoise_spans = num_noise_spans = round(
        num_noise_tokens / mean_noise_span_length)
  Spans alternate between non-noise and noise, beginning with non-noise.
  Subject to the above restrictions, all masks are equally likely.

  Args:
    length: an int32 scalar (length of the incoming token sequence)
    noise_density: a float - approximate density of output mask
    mean_noise_span_length: a number

  Returns:
    a boolean numpy array with shape [length]
  """
  orig_length = length
  # Increase length to avoid degeneracy.
  length = tf.maximum(length, 2)

  def to_int(x):
    return tf.cast(x, tf.int32)

  def to_float(x):
    return tf.cast(x, tf.float32)

  num_noise_tokens = to_int(tf.round(to_float(length) * noise_density))
  # Avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
  num_noise_tokens = tf.minimum(tf.maximum(num_noise_tokens, 1), length - 1)
  num_noise_spans = to_int(
      tf.round(to_float(num_noise_tokens) / mean_noise_span_length))
  # Avoid degeneracy by ensuring a positive number of noise spans.
  num_noise_spans = tf.maximum(num_noise_spans, 1)
  num_nonnoise_tokens = length - num_noise_tokens

  # Pick the lengths of the noise spans and the non-noise spans.
  def _random_segmentation(num_items, num_segments):
    """Partition a sequence of items randomly into non-empty segments.

    Args:
      num_items: an integer scalar > 0
      num_segments: an integer scalar in [1, num_items]

    Returns:
      a Tensor with shape [num_segments] containing positive integers that
      add up to num_items
    """
    first_in_segment = tf.pad(
        tf.random.shuffle(
            to_int(tf.range(num_items - 1) < num_segments - 1), seed=123),
        [[1, 0]])
    segment_id = tf.cumsum(first_in_segment)
    segment_length = tf.segment_sum(tf.ones_like(segment_id), segment_id)
    return segment_length

  noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
  nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens,
                                               num_noise_spans)
  interleaved_span_lengths = tf.reshape(
      tf.stack([nonnoise_span_lengths, noise_span_lengths], axis=1),
      [num_noise_spans * 2])
  span_starts = tf.cumsum(interleaved_span_lengths)[:-1]
  span_start_indicator = tf.unsorted_segment_sum(
      tf.ones_like(span_starts), span_starts, length)
  span_num = tf.cumsum(span_start_indicator)
  is_noise = tf.equal(span_num % 2, 1)
  return is_noise[:orig_length].numpy()
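# Hedged usage sketch (not part of the original source): assumes TF1-style ops
# (tf.unsorted_segment_sum, tf.segment_sum) are available with eager execution
# enabled, since the function calls .numpy() on its result.
def _demo_random_spans_noise_mask():
  mask = random_spans_noise_mask(
      length=20, noise_density=0.15, mean_noise_span_length=3.0)
  # mask is a length-20 boolean array; roughly 15% of entries are True (noise),
  # grouped into contiguous spans, and the sequence begins with non-noise.
  return mask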