def forward_incidence_matrix(self, normalization):
    """Build the sparse receiver-by-edge incidence matrix.

    Args:
        normalization: sequence whose first element selects the mode.
            ``"none"`` keeps all-ones entries, ``"global"`` passes them
            through ``tf.sparse_softmax``, and ``"local"`` takes the softmax
            on a 3-D tensor with a leading ``self.message_types`` axis that
            is then summed away.

    Returns:
        A ``tf.SparseTensor`` of dense shape ``[vertex_count, edge_count]``,
        or ``None`` when the mode is none of the three strings above.
    """
    mode = normalization[0]
    if mode not in ("none", "global", "local"):
        # Unrecognised modes build nothing and fall through to None.
        return None

    # One unit-valued entry per edge; could be replaced by real edge weights.
    unit_weights = tf.to_float(tf.ones_like(self.receiver_indices))
    edge_ids = tf.range(self.edge_count)

    if mode == "local":
        coordinates = [self.message_types, self.receiver_indices, edge_ids]
        extents = [self.label_count * 2, self.vertex_count, self.edge_count]
    else:
        coordinates = [self.receiver_indices, edge_ids]
        extents = [self.vertex_count, self.edge_count]

    incidence = tf.SparseTensor(
        indices=tf.to_int64(tf.transpose(tf.stack(coordinates))),
        values=unit_weights,
        dense_shape=tf.to_int64(tf.stack(extents)))
    if mode == "none":
        return incidence

    normalized = tf.sparse_softmax(incidence)
    if mode == "local":
        # Collapse the message-type axis back to a 2-D matrix.
        normalized = tf.sparse_reduce_sum_sparse(normalized, 0)
    return normalized
def sp_attn_head(seq, out_sz, adj_mat_local, adj_mat_global, activation,
                 in_drop=0.0, coef_drop=0.0, residual=False):
    """Attention head that mixes a local and a non-local sparse neighbourhood.

    The same dense pairwise logits are masked once by ``adj_mat_local`` and
    once by ``adj_mat_global``. Each masked copy goes through leaky-ReLU and a
    sparse softmax, then aggregates the node features. The node features and
    both aggregates are combined by a sum projection and a concat projection.

    Args:
        seq: rank-3 feature tensor; only batch element 0 is used for the
            logits and the tensor is squeezed afterwards, so a batch of one
            is presumably expected (confirm against callers).
        out_sz: unused; the output width is the fixed latent size of 8.
        adj_mat_local: sparse mask for local neighbours.
        adj_mat_global: sparse mask for non-local neighbours.
        activation: callable applied to each of the two projections.
        in_drop: dropout rate applied to ``seq`` (0.0 disables it).
        coef_drop: unused.
        residual: unused.

    Returns:
        ``activation(sum projection) + activation(concat projection)``.
    """
    with tf.name_scope('my_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)
        features = seq

        latent_factor_size = 8
        nb_nodes = features.shape[1].value
        feature_dim = features.shape[2].value

        w_sum = glorot([feature_dim, latent_factor_size])
        w_concat = glorot([3 * feature_dim, latent_factor_size])

        src_score = tf.layers.conv1d(features, 1, 1)
        dst_score = tf.layers.conv1d(features, 1, 1)
        # Dense pairwise logits for batch element 0.
        pair_logits = tf.add(src_score[0], tf.transpose(dst_score[0]))

        def attention_coefficients(adjacency):
            # Mask the logits, apply leaky-ReLU on the kept entries, softmax.
            masked = adjacency * pair_logits
            activated = tf.SparseTensor(
                indices=masked.indices,
                values=tf.nn.leaky_relu(masked.values),
                dense_shape=masked.dense_shape)
            return tf.sparse_reshape(tf.sparse_softmax(activated),
                                     [nb_nodes, nb_nodes])

        # local neighbours
        local_coefs = attention_coefficients(adj_mat_local)
        features = tf.squeeze(features)
        local_embs = tf.sparse.sparse_dense_matmul(local_coefs, features)

        # non-local neighbours
        global_coefs = attention_coefficients(adj_mat_global)
        global_embs = tf.sparse.sparse_dense_matmul(global_coefs, features)

        summed = tf.matmul(
            tf.add(tf.add(features, local_embs), global_embs), w_sum)
        stacked = tf.concat(
            [tf.concat([features, local_embs], axis=-1), global_embs],
            axis=-1)
        concatenated = tf.matmul(stacked, w_concat)

        return activation(summed) + activation(concatenated)
def call(self, inputs):
    """Run the attention stack over entity/relation-derived node features.

    ``inputs`` is indexed positionally: ``[0]`` entity embeddings, ``[1]``
    relation embeddings, ``[2]`` adjacency indices, ``[3]`` node-to-relation
    indices, ``[4]`` node-to-entity indices. Each index input carries a
    leading axis that is squeezed away.

    Initial features are the concatenation of uniformly averaged entity and
    relation embeddings. ``self.depth`` rounds of multi-head sparse attention
    follow, and the activated features of every round (including the initial
    one) are concatenated into the result.
    """
    ent_emb = inputs[0]
    rel_emb = inputs[1]

    adj = tf.SparseTensor(
        K.cast(K.squeeze(inputs[2], axis=0), dtype="int64"),
        K.ones_like(inputs[2][0, :, 0], dtype='float32'),
        (self.node_size, self.node_size))

    def averaged_features(index_input, embeddings, n_cols):
        # Softmax over all-ones entries gives uniform weights per row.
        idx = K.cast(K.squeeze(index_input, axis=0), dtype="int64")
        uniform = tf.SparseTensor(
            indices=idx,
            values=tf.ones_like(idx[:, 0], dtype='float32'),
            dense_shape=(self.node_size, n_cols))
        return tf.sparse_tensor_dense_matmul(tf.sparse_softmax(uniform),
                                             embeddings)

    rel_features = averaged_features(inputs[3], rel_emb, self.rel_size)
    ent_features = averaged_features(inputs[4], ent_emb, self.node_size)

    features = K.concatenate([ent_features, rel_features])
    outputs = [self.activation(features)]

    for _ in range(self.depth):
        head_outputs = []
        for head in range(self.attn_heads):
            kernel = self.attn_kernels[head]
            self_scores = K.dot(features, kernel[0])
            neigh_scores = tf.transpose(K.dot(features, kernel[1]), [1, 0])

            # Scores exist only on the edges kept by the adjacency mask.
            raw = tf.sparse_add(adj * self_scores, adj * neigh_scores)
            activated = tf.SparseTensor(
                indices=raw.indices,
                values=tf.nn.leaky_relu(raw.values),
                dense_shape=raw.dense_shape)
            weights = tf.sparse_softmax(activated)

            aggregated = tf.sparse_tensor_dense_matmul(weights, features)
            if self.use_bias:
                aggregated = K.bias_add(aggregated, self.biases[head])
            head_outputs.append(aggregated)

        if self.attn_heads_reduction == 'concat':
            features = K.concatenate(head_outputs)
        else:
            features = K.mean(K.stack(head_outputs), axis=0)

        features = self.activation(features)
        outputs.append(features)

    return K.concatenate(outputs)
def get_log_prob(model, action_placeholder, mask_placeholder, action_dim=9):
    """Log-probability of the taken actions and entropy under a masked softmax.

    The softmax is computed only over entries where ``mask_placeholder`` is
    true, using ``tf.sparse_softmax``. Masked-out entries get probability
    zero when the sparse result is scattered back to a dense tensor.

    Args:
        model: logits tensor; the softmax is taken per row and reductions
            run over axis 1.
        action_placeholder: integer action indices fed to ``tf.one_hot``.
        mask_placeholder: boolean tensor marking the legal entries of the
            logits.
        action_dim: depth of the one-hot encoding, i.e. the number of
            actions. Defaults to 9, the value previously hard-coded here.

    Returns:
        ``(log_prob, entropy)``: the log of the probability assigned to each
        chosen action, and the per-row entropy of the masked distribution.
    """
    logits = model
    legal_indices = tf.where(mask_placeholder)
    legal_logits = tf.gather_nd(logits, legal_indices)
    dense_shape = tf.cast(tf.shape(logits), tf.int64)

    # Key trick: only designated entries take part in the sparse softmax, so
    # undesignated (illegal) entries end up with probability zero.
    sparse_probs = tf.sparse_softmax(
        tf.SparseTensor(legal_indices, legal_logits, dense_shape))

    probability_dist = tf.scatter_nd(sparse_probs.indices,
                                     sparse_probs.values,
                                     sparse_probs.dense_shape)
    # The log is taken on the sparse values only, so masked entries scatter
    # to 0 rather than log(0).
    log_probability_dist = tf.scatter_nd(sparse_probs.indices,
                                         tf.log(sparse_probs.values),
                                         sparse_probs.dense_shape)

    # Emulates renormalising a dense softmax over the legal entries only.
    chosen = tf.multiply(probability_dist,
                         tf.one_hot(action_placeholder, action_dim))
    entropy = -tf.reduce_sum(probability_dist * log_probability_dist, axis=1)
    # NOTE(review): an action outside the mask has probability 0 here, so its
    # log-probability is -inf.
    log_prob = tf.log(tf.reduce_sum(chosen, axis=1))
    return log_prob, entropy
def attention(self, c, mem, existing_facts):
    """Score each fact against the memory and question, then softmax.

    A two-layer feed-forward network (ReLU hidden layer) runs on similarity
    features built from ``c``, ``mem`` and ``self.re_q``. Its L2-normalised
    output goes through a sparse softmax that skips entries equal to zero,
    so those positions come out as 0 in the returned tensor.

    Args:
        c: fact representations.
        mem: current memory, same shape as ``c``.
        existing_facts: multiplicative mask applied to the inputs, the first
            layer output and both biases; presumably
            ``[batch, n_facts, 1]`` with 1 for real facts (confirm).

    Returns:
        Attention weights with a trailing singleton axis added.
    """
    with tf.variable_scope("attending"):
        similarity = tf.concat([
            c, mem, self.re_q,
            c * self.re_q,
            c * mem,
            (c - self.re_q)**2,
            (c - mem)**2,
        ], 2)

        def per_example(weights):
            # Repeat the weights along the batch axis for batched matmul.
            return tf.tile(weights,
                           tf.stack([tf.shape(similarity)[0], 1, 1]))

        hidden_pre = tf.matmul(similarity * existing_facts,
                               per_example(self.w_1)) * existing_facts
        hidden = tf.nn.relu(hidden_pre + self.b_1 * existing_facts)

        scores = tf.matmul(hidden, per_example(self.w_2))
        scores = tf.nn.l2_normalize(scores + self.b_2 * existing_facts, -1)

        # Treat the dense scores as sparse: keep only non-zero entries and
        # drop the trailing coordinate so the softmax runs over facts.
        nonzero_idx = tf.where(tf.not_equal(scores, 0))[:, :-1]
        nonzero_vals = tf.gather_nd(scores[..., 0], nonzero_idx)
        sparse_shape = tf.shape(scores, out_type=tf.int64)[:-1]
        sparse_scores = tf.SparseTensor(nonzero_idx, nonzero_vals,
                                        sparse_shape)

        weights = tf.sparse_tensor_to_dense(tf.sparse_softmax(sparse_scores))
        return tf.expand_dims(weights, -1)
def sp_attn_head(seq, out_sz, adj_mat, adj_all_mat, adj_neig_mat,
                 N_target_mat, activation, nb_nodes, in_drop=0.0,
                 coef_drop=0.0, residual=False):
    """Sparse attention head blended with a bilinear-pooling branch.

    The output is ``(1 - FLAGS.alpha) * attention_aggregate +
    FLAGS.alpha * bilinear_aggregate``, followed by a learned bias and
    ``activation``. The reshapes to ``(nb_nodes, 1)`` mean the input must
    hold a single graph (batch size 1).

    Args:
        seq: rank-3 node features with a leading batch axis of size 1.
        out_sz: width of the projected features.
        adj_mat: sparse adjacency used to mask the attention logits.
        adj_all_mat: unused.
        adj_neig_mat: sparse matrix passed to ``BILinear_pooling``.
        N_target_mat: left operand of the ``dot`` applied to the pooled
            features.
        activation: callable applied to the biased output.
        nb_nodes: number of nodes in the graph.
        in_drop: dropout rate for the input and the projected features.
        coef_drop: dropout rate for the values of ``adj_mat`` and
            ``adj_neig_mat``; note it is applied before the softmax.
        residual: unused.

    Returns:
        Activated tensor with static shape ``[1, nb_nodes, out_sz]``.
    """
    with tf.name_scope('sp_attn'):
        if coef_drop != 0.0:
            keep_coef = 1.0 - coef_drop
            adj_mat = tf.SparseTensor(
                indices=adj_mat.indices,
                values=tf.nn.dropout(adj_mat.values, keep_coef),
                dense_shape=adj_mat.dense_shape)
            adj_neig_mat = tf.SparseTensor(
                indices=adj_neig_mat.indices,
                values=tf.nn.dropout(adj_neig_mat.values, 1.0 - coef_drop),
                dense_shape=adj_neig_mat.dense_shape)
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        projected = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # Additive self-attention: one scalar per node for each edge end.
        src_score = tf.layers.conv1d(projected, 1, 1)
        dst_score = tf.layers.conv1d(projected, 1, 1)
        src_score = tf.reshape(src_score, (nb_nodes, 1))
        dst_score = tf.reshape(dst_score, (nb_nodes, 1))

        row_term = adj_mat * src_score
        col_term = adj_mat * tf.transpose(dst_score, [1, 0])
        edge_logits = tf.sparse_add(row_term, col_term)
        activated = tf.SparseTensor(
            indices=edge_logits.indices,
            values=tf.nn.leaky_relu(edge_logits.values),
            dense_shape=edge_logits.dense_shape)
        coefs = tf.sparse_softmax(activated)

        if in_drop != 0.0:
            projected = tf.nn.dropout(projected, 1.0 - in_drop)

        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        projected = tf.squeeze(projected)

        # Bilinear-pooling branch.
        pooled = BILinear_pooling(adj_neig_mat, projected)
        pooled = dot(N_target_mat, pooled, True)

        # Attention branch.
        attended = tf.sparse_tensor_dense_matmul(coefs, projected)

        blended = (1 - FLAGS.alpha) * attended + FLAGS.alpha * pooled
        blended = tf.expand_dims(blended, axis=0)
        blended.set_shape([1, nb_nodes, out_sz])
        biased = tf.contrib.layers.bias_add(blended)

        return activation(biased)
def maskedSoftmax(logits, mask):
    """Softmax over the legal entries only; illegal entries get probability 0.

    The legal logits are packed into a ``tf.SparseTensor`` and normalised
    with ``tf.sparse_softmax``, which ignores positions that are not stored.
    Scattering the result back to a dense tensor leaves zeros everywhere
    else.

    Args:
        logits: ``[None, ac_dim]`` tensor of unnormalised scores.
        mask: ``[None, ac_dim]`` boolean tensor; true marks a legal move.

    Returns:
        ``[None, ac_dim]`` tensor of per-row probability distributions with
        zero probability on illegal moves.
    """
    legal_positions = tf.where(mask)
    legal_logits = tf.gather_nd(logits, legal_positions)
    full_shape = tf.cast(tf.shape(logits), tf.int64)

    legal_probs = tf.sparse_softmax(
        tf.SparseTensor(legal_positions, legal_logits, full_shape))

    dense_probs = tf.scatter_nd(legal_probs.indices,
                                legal_probs.values,
                                legal_probs.dense_shape)
    # scatter_nd loses the static shape; restore it from the logits.
    dense_probs.set_shape(logits.shape)
    return dense_probs
def __call__(self, inputs):
    """Apply the sparse-input graph attention layer.

    Sparse dropout is applied to ``inputs``, which is then projected with
    ``self.vars['weights']``. Per-edge attention logits are built on the
    sparsity pattern of ``self.adj``, normalised with a sparse softmax and
    used to aggregate the projected features.

    Args:
        inputs: sparse feature matrix (consumed by ``dropout_sparse`` and
            ``tf.sparse_tensor_dense_matmul``).

    Returns:
        ``self.act`` applied to the attention-weighted features.
    """
    with tf.name_scope(self.name):
        dropped = dropout_sparse(inputs, 1 - self.dropout,
                                 self.features_nonzero)
        # Sparse x dense matmul: the result is dense.
        projected = tf.sparse_tensor_dense_matmul(dropped,
                                                  self.vars['weights'])

        # Split XWa into XWa_{self} + XWa_{neigh}.
        self_score = tf.matmul(projected, self.vars['attention_self'])    # (N,1)
        neigh_score = tf.matmul(projected, self.vars['attention_neigh'])  # (N,1)

        # Build the attention weights as a SparseTensor.
        # Placeholder shapes are unknown ("?") until values are fed, so a
        # Python loop over self.adj.indices cannot be used here.
        # Elementwise (N,N) * (N,1) scales every column by the (N,1) vector;
        # elementwise (N,N) * (1,N) scales every row by the (1,N) vector.
        # Note the leaky-ReLU is applied to each term before they are added.
        row_term = self.adj * tf.nn.leaky_relu(self_score, alpha=0.2)
        col_term = self.adj * tf.transpose(
            tf.nn.leaky_relu(neigh_score, alpha=0.2))
        att = tf.sparse_add(row_term, col_term)
        del row_term, col_term
        gc.collect()

        att = tf.sparse_softmax(att)
        aggregated = tf.sparse_tensor_dense_matmul(att, projected)
        outputs = self.act(aggregated)

    return outputs
def go():
    """Demo comparing dense softmax with ``tf.sparse_softmax`` on a toy matrix.

    Builds three softmaxes of the same 2x6 matrix: a plain dense softmax, a
    dense softmax with non-positive entries replaced by -10, and a sparse
    softmax over the non-zero entries. It prints the gradient function
    registered for each intermediate op and then the evaluated results,
    including two ways of densifying the sparse output.
    """
    from tensorflow.python.framework import ops

    dense = tf.Variable([[0, 0, 10, 1, 0, 0], [0, 0, -2, 3, 0, 0]],
                        dtype=tf.float32)
    sm1 = tf.nn.softmax(dense)

    # NOTE(review): the condition is `> 0`, so the -2 entry is replaced too,
    # not only the zeros.
    denseReplacing0WithNeg10 = tf.where(
        dense > 0.0, dense, tf.ones(tf.shape(dense), tf.float32) * (-10.0))
    sm2 = tf.nn.softmax(denseReplacing0WithNeg10)

    nz_indices = tf.where(tf.not_equal(dense, tf.constant(0,
                                                          dtype=tf.float32)))
    nz_values = tf.gather_nd(dense, nz_indices)
    sparse = tf.SparseTensor(nz_indices, nz_values, dense.get_shape())
    sm3 = tf.sparse_softmax(sparse)

    # Two ways of turning the sparse softmax back into a dense tensor.
    dm3a = tf.sparse_to_dense(sm3.indices, sm3.get_shape(), sm3.values)
    dm3b = tf.scatter_nd(sm3.indices, sm3.values, dense.get_shape())

    # The context manager guarantees the session is closed.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # str.format keeps the output identical on Python 2 and 3 and is
        # safe for namedtuple results such as SparseTensorValue.
        for v in (nz_indices, nz_values, sparse, sm3, dm3a, dm3b):
            print('gradient of op {} {}'.format(
                v, ops.get_gradient_function(v.op)))

        print('dense sm - direct {}'.format(session.run(sm1)))
        print('dense sm - with -10 trick {}'.format(session.run(sm2)))
        print('sparse sm {}'.format(session.run(sm3)))
        print('densified sparse sm - old {}'.format(session.run(dm3a)))
        # Previously printed dm3a a second time under the "new" label.
        print('densified sparse sm - new {}'.format(session.run(dm3b)))
def OLD_to_embedding(self, values, indices):
    """Map centroid values to embedding slots (legacy implementation).

    Only the ``"sum"`` duplicate policy is implemented. ``"average"`` and any
    other policy do nothing and return ``None``.

    Args:
        values: dense matrix multiplied on the right of the sparse
            assignment matrix.
        indices: target column index for each centroid.

    Returns:
        ``assignment @ values`` for the ``"sum"`` policy, where
        ``assignment`` is a sparse-softmaxed matrix of dense shape
        ``[n_centroids, target_embedding_size]``; otherwise ``None``.
    """
    if self.duplicate_policy != "sum":
        return None

    lookup = self.variables.get_variable
    prefix = self.variable_prefix

    centroid_ids = tf.range(lookup(prefix + "n_centroids"))
    unit_values = tf.to_float(tf.ones_like(centroid_ids))

    n_rows = lookup(prefix + "n_centroids")
    n_cols = lookup(prefix + "target_embedding_size")

    coords = tf.to_int64(tf.transpose(tf.stack([centroid_ids, indices])))
    shape = tf.to_int64([n_rows, n_cols])

    assignment = tf.sparse_softmax(
        tf.SparseTensor(indices=coords, values=unit_values,
                        dense_shape=shape))
    return tf.sparse_tensor_dense_matmul(assignment, values)
def call(self, inputs):
    """Attention aggregation of neighbour embeddings into self embeddings.

    Args:
        inputs: triple ``(self_embedding, neigh_embedding, adj)`` where
            ``adj`` is a sparse matrix whose values are replaced by ones.

    With ``self.renorm`` set, an identity block is prepended to ``adj`` along
    axis 1 and self and neighbour embeddings are stacked, so each node also
    attends to itself. Without it, the transformed self embedding is added to
    the aggregated neighbours afterwards.

    Returns:
        The aggregated embeddings, passed through ``self.activation`` if one
        is configured.
    """
    self_embedding, neigh_embedding, adj = inputs
    adj = _sparse_ones_like(adj)

    if self.renorm:
        eye = _sparse_eye(adj.dense_shape[0])
        adj = tf.sparse_concat(1, [eye, adj])
        stacked = tf.concat([self_embedding, neigh_embedding], 0)
        from_all = self.dense(stacked)
        # The leading rows correspond to the self embeddings.
        from_self = from_all[:adj.dense_shape[0], :]
    else:
        from_all = self.dense(neigh_embedding)
        from_self = self.dense(self_embedding)

    self_weight = self.self_layer(from_self)
    all_weight = self.neigh_layer(from_all)

    # Per-edge logit = self score (column broadcast) + neighbour score (row).
    edge_logits = tf.sparse_add(adj * self_weight,
                                adj * tf.reshape(all_weight, [1, -1]))
    activated = tf.SparseTensor(edge_logits.indices,
                                tf.nn.leaky_relu(edge_logits.values),
                                edge_logits.dense_shape)
    attention = tf.sparse_softmax(activated)

    output = tf.sparse_tensor_dense_matmul(attention, from_all)
    if not self.renorm:
        output = from_self + output
    if self.activation:
        output = self.activation(output)
    return output
def __call__(self, u_inputs, v_inputs, u_size, v_size):
    """Bipartite sparse attention: ``u`` nodes attend over ``v`` nodes.

    Args:
        u_inputs: rank-3 features of the attending nodes; the reshape to
            ``(u_size, 1)`` requires a leading batch axis of 1.
        v_inputs: rank-3 features of the attended nodes, same batch
            constraint with ``v_size``.
        u_size: number of ``u`` nodes.
        v_size: number of ``v`` nodes.

    Returns:
        ``self.act`` applied to the bias-added aggregate of the projected
        ``v`` features, weighted by softmaxed attention on ``self.adj_mat``.
    """
    adjacency = self.adj_mat

    # Simplest self-attention possible: one scalar score per node.
    u_score = tf.reshape(tf.layers.conv1d(u_inputs, 1, 1), (u_size, 1))
    v_score = tf.reshape(tf.layers.conv1d(v_inputs, 1, 1), (v_size, 1))

    projected = tf.layers.conv1d(v_inputs, self.output_dim, 1,
                                 use_bias=False)

    row_term = adjacency * u_score
    col_term = adjacency * tf.transpose(v_score, [1, 0])
    edge_logits = tf.sparse_add(row_term, col_term)
    activated = tf.SparseTensor(
        indices=edge_logits.indices,
        values=tf.nn.leaky_relu(edge_logits.values),
        dense_shape=edge_logits.dense_shape)

    coefs = tf.sparse_reshape(tf.sparse_softmax(activated),
                              [u_size, v_size])
    projected = tf.squeeze(projected)
    vals = tf.sparse_tensor_dense_matmul(coefs, projected)
    print('--------vals.shape------', vals.shape)

    biased = tf.contrib.layers.bias_add(vals)
    return self.act(biased)
def _call(self, inputs):
    """Sparse graph-attention head for a single graph.

    Args:
        inputs: rank-3 node features; the reshapes below assume a batch of
            exactly one graph with ``self.nb_nodes`` nodes.

    Returns:
        ``self.act`` applied to the bias-added attention aggregate, with
        static shape ``[1, nb_nodes, out_sz]``.
    """
    n = self.nb_nodes
    projected = tf.layers.conv1d(inputs, self.out_sz, 1, use_bias=False)

    # Simplest self-attention possible: one scalar score per node.
    src_raw = tf.layers.conv1d(projected, 1, 1)
    dst_raw = tf.layers.conv1d(projected, 1, 1)
    src_score = tf.reshape(src_raw, (n, 1))
    dst_score = tf.reshape(dst_raw, (n, 1))

    # Scores are kept only where self.bias_mat stores an entry.
    row_term = self.bias_mat * src_score
    col_term = self.bias_mat * tf.transpose(dst_score, [1, 0])
    edge_logits = tf.sparse_add(row_term, col_term)
    activated = tf.SparseTensor(
        indices=edge_logits.indices,
        values=tf.nn.leaky_relu(edge_logits.values),
        dense_shape=edge_logits.dense_shape)
    coefs = tf.sparse_softmax(activated)

    # tf.sparse_tensor_dense_matmul needs rank-2 arguments, so this assumes
    # batch size 1 and reshapes accordingly. Any other batch size fails.
    coefs = tf.sparse_reshape(coefs, [n, n])
    projected = tf.squeeze(projected)
    aggregated = tf.sparse_tensor_dense_matmul(coefs, projected)
    aggregated = tf.expand_dims(aggregated, axis=0)
    aggregated.set_shape([1, n, self.out_sz])

    return self.act(tf.contrib.layers.bias_add(aggregated))
def sp_attn_head(seq, out_sz, adj_mat, activation, nb_nodes, in_drop=0.0,
                 coef_drop=0.0, residual=False):
    """Sparse graph-attention head for a single graph.

    Args:
        seq: rank-3 node features; the reshapes below require a leading
            batch axis of size 1.
        out_sz: width of the projected features.
        adj_mat: sparse adjacency; attention logits exist only on its
            stored entries.
        activation: callable applied to the result, or ``None`` to return
            the pre-activation output (used for the final layer).
        nb_nodes: number of nodes in the graph.
        in_drop: dropout rate for the input and the projected features.
        coef_drop: dropout rate for the attention coefficients.
        residual: when true, add a skip connection from the (post-dropout)
            input, projected with a 1x1 convolution if the widths differ.

    Returns:
        Tensor with static shape ``[1, nb_nodes, out_sz]``.
    """
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)

        f_1 = tf.reshape(f_1, (nb_nodes, 1))
        f_2 = tf.reshape(f_2, (nb_nodes, 1))

        f_1 = adj_mat * f_1
        f_2 = adj_mat * tf.transpose(f_2, [1, 0])

        logits = tf.sparse_add(f_1, f_2)
        lrelu = tf.SparseTensor(indices=logits.indices,
                                values=tf.nn.leaky_relu(logits.values),
                                dense_shape=logits.dense_shape)
        coefs = tf.sparse_softmax(lrelu)

        if coef_drop != 0.0:
            coefs = tf.SparseTensor(indices=coefs.indices,
                                    values=tf.nn.dropout(coefs.values,
                                                         1.0 - coef_drop),
                                    dense_shape=coefs.dense_shape)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # As tf.sparse_tensor_dense_matmul expects its arguments to have
        # rank-2, here we make an assumption that our input is of batch
        # size 1, and reshape appropriately.
        # The method will fail in all other cases!
        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        seq_fts = tf.squeeze(seq_fts)
        vals = tf.sparse_tensor_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, nb_nodes, out_sz])
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                # NOTE(review): `conv1d` is a module-level name defined
                # outside this block (presumably tf.layers.conv1d).
                ret = ret + conv1d(seq, ret.shape[-1], 1)
            else:
                # Previously assigned to the dead local `seq_fts`, which
                # silently dropped the skip connection for equal widths.
                ret = ret + seq

        if activation is None:  # for the final layer
            return ret
        return activation(ret)
def avg(tensor, size):
    """Uniformly average rows of ``tensor[1]`` selected by sparse indices.

    Args:
        tensor: pair ``(indices, embeddings)``. ``indices`` has a leading
            axis that is squeezed away; each remaining row is a
            ``(row, col)`` coordinate.
        size: number of columns of the sparse matrix.

    Returns:
        Product of the row-softmaxed all-ones sparse matrix (dense shape
        ``(node_size, size)``, with ``node_size`` taken from the enclosing
        scope) and ``tensor[1]``.
    """
    edge_index = K.cast(K.squeeze(tensor[0], axis=0), dtype="int64")
    uniform = tf.ones_like(edge_index[:, 0], dtype='float32')
    # Softmax over identical values gives equal weight to each stored entry
    # of a row, which turns the matmul into an average.
    averaging = tf.sparse_softmax(
        tf.SparseTensor(indices=edge_index, values=uniform,
                        dense_shape=(node_size, size)))
    return tf.sparse_tensor_dense_matmul(averaging, tensor[1])
def attention_mechanism(name, v, W_s, W_d, V, cur_embed, left, right, n2n):
    """Compute sparse edge-attention coefficients on the pattern of ``n2n``.

    Supported values of ``name``:

    * ``'linear'``:  a_ij ∝ v^T tanh(W_s (mu_i + mu_j))
    * ``'abs'``:     a_ij ∝ v^T tanh(W_s |mu_i - mu_j|)
    * ``'bilinear'``: a_ij ∝ leakyrelu(mu_i V mu_j)
    * ``'generalized_linear'``: a_ij ∝ v^T tanh(W_s mu_i + W_d mu_j)

    Args:
        name: which scoring function to use (see above).
        v: weight vector, reshaped to a column before use.
        W_s: weight matrix for the source (or combined) embedding.
        W_d: weight matrix for the destination embedding.
        V: bilinear form matrix.
        cur_embed: dense node embeddings.
        left: sparse edge-to-node selector for one end of each edge.
        right: sparse edge-to-node selector for the other end.
        n2n: sparse node-to-node matrix. Its indices and dense shape define
            where the scores are placed, so it must store one entry per
            score, in the same order.

    Returns:
        ``tf.SparseTensor`` of softmax-normalised attention coefficients.

    Exits the process with status -1 (``sys.exit``) for an unknown ``name``.
    """
    # NOTE(review): `edge` is not a parameter; the 'linear' and 'abs' modes
    # read it from the enclosing module scope. Confirm it is defined there.
    if name == 'linear':
        t = tf.sparse_tensor_dense_matmul(sp_a=edge, b=cur_embed)  # m by n
        t = tf.matmul(t, W_s)
        t = tf.nn.tanh(t)
        t = tf.matmul(t, tf.reshape(v, [-1, 1]))  # m by 1
    elif name == 'abs':
        t = tf.sparse_tensor_dense_matmul(sp_a=edge, b=cur_embed)  # m by n
        t = tf.abs(t)
        t = tf.matmul(t, W_s)
        t = tf.nn.tanh(t)
        t = tf.matmul(t, tf.reshape(v, [-1, 1]))  # m by 1
    elif name == 'bilinear':
        tl = tf.sparse_tensor_dense_matmul(sp_a=left, b=cur_embed)  # m by k
        tl = tf.matmul(tl, V)
        tr = tf.sparse_tensor_dense_matmul(sp_a=right, b=cur_embed)
        t = tf.reduce_sum(tf.multiply(tl, tr), 1, keep_dims=True)
        # The layer must be constructed and then applied; the old
        # `LeakyReLU(t)` passed `t` as the layer's alpha and returned a
        # layer object, not a tensor.
        t = tf.keras.layers.LeakyReLU()(t)
    elif name == 'generalized_linear':
        # This was a separate `if`, so its `else: sys.exit(-1)` terminated
        # the process for every other (valid) mode.
        tl = tf.sparse_tensor_dense_matmul(sp_a=left, b=cur_embed)  # m by k
        tl = tf.matmul(tl, W_s)
        tr = tf.sparse_tensor_dense_matmul(sp_a=right, b=cur_embed)
        tr = tf.matmul(tr, W_d)
        t = tf.nn.tanh(tf.add(tl, tr))
        t = tf.matmul(t, tf.reshape(v, [-1, 1]))
    else:
        sys.exit(-1)

    sparse_attention = tf.SparseTensor(n2n.indices, tf.reshape(t, [-1]),
                                       n2n.dense_shape)
    return tf.sparse_softmax(sparse_attention)
def build_sparse_matrix_softmax(self, idx_non_zero_values, X, dense_shape_A):
    """Build a sparse matrix from per-entry scores and softmax-normalise it.

    Args:
        idx_non_zero_values: coordinates of the stored entries.
        X: one score per stored entry; flattened to rank 1 before use.
        dense_shape_A: dense shape of the resulting matrix.

    Returns:
        ``tf.SparseTensor`` in canonical index order, with the softmax
        applied by ``tf.sparse_softmax``.
    """
    # Flatten with reshape rather than squeeze: squeeze would collapse a
    # single-entry input to a scalar, which is not a valid values vector.
    values = tf.reshape(X, [-1])
    # tf.SparseTensor is the graph-side type; SparseTensorValue is the
    # feed/fetch counterpart and was only being converted implicitly.
    A = tf.SparseTensor(idx_non_zero_values, values, dense_shape_A)
    A = tf.sparse_reorder(A)  # n_edges x n_edges
    return tf.sparse_softmax(A)
def testGradient(self):
    """Check the numerical gradient of ``tf.sparse_softmax`` on its values.

    For float32 and float64, a random ``[2, 5, 10]`` array is sparsified and
    the gradient error between the input values and the softmaxed values
    must stay below ``1e-4``.
    """
    x_shape = [2, 5, 10]
    with self.test_session(use_gpu=False):
        for dtype in (np.float32, np.float64):
            dense_input = np.random.randn(*x_shape).astype(dtype)
            sparse_input, nnz = _sparsify(dense_input)
            softmaxed = tf.sparse_softmax(sparse_input)

            # Input and output value vectors both have one slot per
            # stored entry.
            value_shape = (nnz,)
            err = tf.test.compute_gradient_error(
                sparse_input.values, value_shape,
                softmaxed.values, value_shape)
            self.assertLess(err, 1e-4)
def build_sparse_matrix_softmax(self, idx_non_zero_values, X, dense_shape_A):
    """Assemble a sparse score matrix and normalise it with a sparse softmax.

    Args:
        idx_non_zero_values: coordinates of the stored entries.
        X: one score per stored entry; flattened to rank 1 before use.
        dense_shape_A: dense shape of the resulting matrix.

    Returns:
        The reordered, softmax-normalised ``tf.SparseTensor``.
    """
    # reshape(-1) instead of squeeze: with exactly one stored entry,
    # squeeze would yield a rank-0 tensor and the SparseTensor would be
    # invalid.
    flat_scores = tf.reshape(X, [-1])
    # Use the graph-side tf.SparseTensor instead of the feed/fetch
    # SparseTensorValue namedtuple.
    scores = tf.SparseTensor(idx_non_zero_values, flat_scores, dense_shape_A)
    ordered = tf.sparse_reorder(scores)  # n_edges x n_edges
    normalised = tf.sparse_softmax(ordered)
    # Dropout after the softmax (on the values, with self.keep_prob) is
    # intentionally left disabled.
    return normalised
def sp_attn_head(self, seq, in_sz, out_sz, adj_mat, activation, in_drop=0.0,
                 coef_drop=0.0, residual=False, layer_str="",
                 sparse_inputs=False, reuse_scope=None):
    """Sparse attention head for the GAT layer.

    Note: the variable scope is necessary to avoid variable duplication
    across snapshots.

    Args:
        seq: node features. With ``sparse_inputs`` a sparse ``[N, in_sz]``
            matrix is expected; if it turns out not to be sparse, a dense
            matmul with a separate ``[out_sz, out_sz]`` weight is used.
            Otherwise a dense rank-3 tensor for ``tf.layers.conv1d``.
        in_sz: input feature width.
        out_sz: output feature width.
        adj_mat: sparse ``[N, N]`` adjacency masking the attention logits.
        activation: callable applied to the final result.
        in_drop: dropout rate on the projected features.
        coef_drop: dropout rate on the attention coefficients.
        residual: add a learned linear skip connection from ``seq``.
        layer_str: suffix used in variable and layer names.
        sparse_inputs: whether ``seq`` is handled through the sparse path.
        reuse_scope: forwarded as ``reuse`` to the variable scope and layers.

    Returns:
        ``activation`` applied to a ``[1, N, out_sz]`` tensor.
    """
    with tf.variable_scope('struct_attn', reuse=reuse_scope):
        if sparse_inputs:
            weight_var = tf.get_variable(
                "layer_" + str(layer_str) + "_weight_transform",
                shape=[in_sz, out_sz], dtype=tf.float32)
            new_temporal_weight_var = tf.get_variable(
                "layer_" + str(layer_str) + "_new_weight_transform",
                shape=[out_sz, out_sz], dtype=tf.float32)
            try:
                projected = tf.sparse_tensor_dense_matmul(seq, weight_var)
            except (TypeError, ValueError):
                # sparse_tensor_dense_matmul raises TypeError for a
                # non-sparse input and ValueError for incompatible static
                # shapes. Only those trigger the dense fallback; anything
                # else (including KeyboardInterrupt) now propagates instead
                # of being swallowed by a bare `except`.
                projected = tf.matmul(seq, new_temporal_weight_var)
            seq_fts = tf.expand_dims(projected, axis=0)  # [N, F]
        else:
            seq_fts = tf.layers.conv1d(
                seq, out_sz, 1, use_bias=False,
                name='layer_' + str(layer_str) + '_weight_transform',
                reuse=reuse_scope)

        # Additive self-attention.
        f_1 = tf.layers.conv1d(seq_fts, 1, 1,
                               name='layer_' + str(layer_str) + '_a1',
                               reuse=reuse_scope)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1,
                               name='layer_' + str(layer_str) + '_a2',
                               reuse=reuse_scope)
        f_1 = tf.reshape(f_1, [-1, 1])  # [N, 1]
        f_2 = tf.reshape(f_2, [-1, 1])  # [N, 1]

        # adj_mat is [N, N] (sparse)
        logits = tf.sparse_add(adj_mat * f_1, adj_mat * tf.transpose(f_2))

        leaky_relu = tf.SparseTensor(
            indices=logits.indices,
            values=self.leaky_relu(logits.values),
            dense_shape=logits.dense_shape)
        coefficients = tf.sparse_softmax(leaky_relu)  # [N, N] (sparse)

        if coef_drop != 0.0:
            coefficients = tf.SparseTensor(
                indices=coefficients.indices,
                values=tf.nn.dropout(coefficients.values, 1.0 - coef_drop),
                dense_shape=coefficients.dense_shape)  # [N, N] (sparse)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)  # [N, D]

        seq_fts = tf.squeeze(seq_fts)
        values = tf.sparse_tensor_dense_matmul(coefficients, seq_fts)
        values = tf.reshape(values, [-1, out_sz])
        values = tf.expand_dims(values, axis=0)
        ret = values  # [1, N, F]

        if residual:
            residual_wt = tf.get_variable(
                "layer_" + str(layer_str) + "_residual_weight",
                shape=[in_sz, out_sz], dtype=tf.float32)
            if sparse_inputs:
                # [N, F] * [F, D] = [N, D].
                ret = ret + tf.expand_dims(
                    tf.sparse_tensor_dense_matmul(seq, residual_wt), axis=0)
            else:
                ret = ret + tf.layers.conv1d(
                    seq, out_sz, 1, use_bias=False,
                    name='layer_' + str(layer_str) + '_residual_weight',
                    reuse=reuse_scope)
        return activation(ret)
def attention_mechanism(features, graph_adj, adj_with_self_loops_indices,
                        coefficient_dropout_prob, weight_decay, name):
    """Compute softmax-normalised attention coefficients for every edge.

    Each node gets two scalar scores (``features @ a_i`` and
    ``features @ a_j``, each with a learned bias). The score of an edge is
    the sum of the first score of its left node and the second score of its
    right node, passed through LeakyReLU (alpha 0.2). Scores are placed on
    the sparsity pattern of ``graph_adj`` and normalised with a sparse
    softmax.

    Args:
        features: transformed node features, ``num_nodes x input_dim``.
        graph_adj: sparse adjacency providing indices and dense shape. Its
            entries must be ordered like ``adj_with_self_loops_indices``.
        adj_with_self_loops_indices: pair of index tensors giving the left
            and right node of each edge.
        coefficient_dropout_prob: dropout probability for the coefficients;
            cast to bool to decide whether dropout is applied at all.
        weight_decay: L2 regularisation strength for both attention vectors.
        name: prefix for the two attention variables.

    Returns:
        Sparse ``num_nodes x num_nodes`` attention coefficients.
    """
    input_dim = int(features.get_shape()[1])

    def attention_vector(variable_name):
        # One weight vector of the single-layer feed-forward scorer.
        return tf.get_variable(
            variable_name, [input_dim, 1], dtype=tf.float32,
            initializer=tf.glorot_uniform_initializer(),
            regularizer=slim.l2_regularizer(weight_decay))

    a_i = attention_vector(f"{name}-att_i")
    a_j = attention_vector(f"{name}-att_j")
    for vector in (a_i, a_j):
        tf.add_to_collection(ATTENTION_WEIGHTS, vector)

    # num_nodes x input_dim  @  input_dim x 1  ->  num_nodes x 1
    left_scores = tf.contrib.layers.bias_add(tf.matmul(features, a_i))
    right_scores = tf.contrib.layers.bias_add(tf.matmul(features, a_j))

    # Gather the score of each edge's left and right node and add them. The
    # result is ordered like the edges of the sparse adjacency matrix.
    edge_scores = (
        tf.gather(left_scores, adj_with_self_loops_indices[0], axis=0)
        + tf.gather(right_scores, adj_with_self_loops_indices[1], axis=0))
    edge_scores = tf.squeeze(edge_scores)  # -> num_edges

    # Blow the list of edge scores up into a sparse matrix using the
    # coordinates of the adjacency matrix, applying the LeakyReLU on the way.
    score_matrix = tf.SparseTensor(
        indices=graph_adj.indices,
        values=tf.nn.leaky_relu(edge_scores, alpha=0.2),
        dense_shape=graph_adj.dense_shape)

    # Normalise the coefficients of each node with a softmax.
    normalised = tf.sparse_softmax(score_matrix)

    # Dropout on the coefficients exposes each node to a sampled subset of
    # its neighbours; it is skipped when the probability casts to False.
    return tf.cond(
        tf.cast(coefficient_dropout_prob, tf.bool),
        true_fn=(lambda: dropout_supporting_sparse_tensors(
            normalised, 1.0 - coefficient_dropout_prob)),
        false_fn=(lambda: normalised))
def _init_weights(self):
    """Create one trainable weight per stored edge and softmax-normalise.

    The variable ``'weights'`` (stored on ``self.values``) has one entry per
    non-zero of ``self.graph`` in COO order. It is placed on the graph's
    sparsity pattern and normalised with ``tf.sparse_softmax``.

    Returns:
        Sparse ``[num_nodes, num_nodes]`` tensor of normalised weights.
    """
    coo = self.graph.tocoo()
    # One (row, col) pair per stored entry.
    edge_coords = np.column_stack((coo.row, coo.col))

    self.values = tf.get_variable('weights', shape=coo.row.shape)

    raw_weights = tf.SparseTensor(edge_coords, self.values,
                                  [self.num_nodes, self.num_nodes])
    return tf.sparse_softmax(raw_weights)
def call(self, inputs):
    """Relation-aware multi-head attention over a sparse adjacency.

    ``inputs`` is indexed positionally: ``[0]`` node features, ``[1]``
    relation embeddings, ``[2]`` adjacency indices, ``[3]`` indices and
    ``[4]`` values of a sparse ``(triple_size, rel_size)`` matrix that mixes
    relation embeddings per edge. Inputs 2-4 carry a leading axis that is
    squeezed or indexed away.

    For every layer and head, each neighbour embedding is reflected about
    the L2-normalised relation vector of its edge (``x - 2 (x·r) r``) and
    the reflected neighbours are summed per source node with softmaxed
    attention weights. The activated features of every layer, including the
    input, are concatenated into the result.
    """
    features = inputs[0]
    rel_emb = inputs[1]

    adj = tf.SparseTensor(
        K.cast(K.squeeze(inputs[2], axis=0), dtype="int64"),
        K.ones_like(inputs[2][0, :, 0]),
        (self.node_size, self.node_size))
    sparse_indices = tf.squeeze(inputs[3], axis=0)
    sparse_val = tf.squeeze(inputs[4], axis=0)

    features = self.activation(features)
    outputs = [features]

    for layer in range(self.depth):
        head_outputs = []
        for head in range(self.attn_heads):
            kernel = self.attn_kernels[layer][head]

            # Per-edge relation vector.
            edge_rel = tf.SparseTensor(
                indices=sparse_indices, values=sparse_val,
                dense_shape=(self.triple_size, self.rel_size))
            edge_rel = tf.sparse_tensor_dense_matmul(edge_rel, rel_emb)

            neighs = K.gather(features, adj.indices[:, 1])
            selfs = K.gather(features, adj.indices[:, 0])

            # Reflect each neighbour about its unit relation vector.
            edge_rel = tf.nn.l2_normalize(edge_rel, 1)
            projection = tf.reduce_sum(neighs * edge_rel, 1,
                                       keepdims=True) * edge_rel
            neighs = neighs - 2 * projection

            scores = K.squeeze(
                K.dot(K.concatenate([selfs, neighs, edge_rel]), kernel),
                axis=-1)
            att = tf.sparse_softmax(
                tf.SparseTensor(indices=adj.indices, values=scores,
                                dense_shape=adj.dense_shape))

            # Sum the weighted neighbours of each source node.
            aggregated = tf.segment_sum(
                neighs * K.expand_dims(att.values, axis=-1),
                adj.indices[:, 0])
            head_outputs.append(aggregated)

        if self.attn_heads_reduction == 'concat':
            features = K.concatenate(head_outputs)  # (N x KF')
        else:
            features = K.mean(K.stack(head_outputs), axis=0)

        features = self.activation(features)
        outputs.append(features)

    return K.concatenate(outputs)
def compute_inference(self, node_features_in, sp_adj_matrix, is_training):
    """Predict an adjacency matrix, sparsify it and run the node model on it.

    Args:
        node_features_in: node features passed to both sub-models.
        sp_adj_matrix: input adjacency, passed to the edge model and to
            ``get_sp_topk``.
        is_training: training flag forwarded to both sub-models.

    Returns:
        ``(logits, adj_matrix_pred)``: the node model output and the raw
        edge model prediction. The latter is also stored on
        ``self.adj_matrix_pred``.
    """
    predicted = self.edge_model.compute_inference(
        node_features_in, sp_adj_matrix, is_training)
    self.adj_matrix_pred = predicted

    keep_mask = get_sp_topk(predicted, sp_adj_matrix, self.nb_nodes,
                            self.topk)
    # Mask the activated predictions, convert to sparse, then softmax over
    # the entries that survived.
    masked_scores = tf.multiply(keep_mask, tf.nn.leaky_relu(predicted))
    attention = tf.sparse_softmax(
        tf.contrib.layers.dense_to_sparse(masked_scores))

    logits = self.node_model.compute_inference(node_features_in, attention,
                                               is_training)
    return logits, predicted
def attention(self, w_1, b_1, w_2, b_2, context_facts, current_mem,
              existing_facts, re_question_rnn):
    """
    Custom attention mechanism: builds similarity measures between each
    fact, the current memory (i.e. the question vector) and the original
    question, and turns them into attention weights over the facts.

    :param w_1: first-layer weights of the feed-forward scorer.
    :param b_1: first-layer bias, masked by ``existing_facts``.
    :param w_2: second-layer weights of the feed-forward scorer.
    :param b_2: second-layer bias, masked by ``existing_facts``.
    :param context_facts: a [batch_size, maximum_sentence_count,
        recurrent_cell_size] tensor that contains all the facts from the
        contexts.
    :param current_mem: a [batch_size, maximum_sentence_count,
        recurrent_cell_size] tensor that contains the current memory. It
        should be the same memory for all facts for accurate results.
    :param existing_facts: a [batch_size, maximum_sentence_count, 1] tensor
        that acts as a binary mask for which facts exist and which do not.
    :param re_question_rnn: question representation, combined elementwise
        with ``context_facts``.
    :return: attention weights with a trailing singleton axis; positions
        whose normalised score is exactly zero receive weight 0.
    """
    with tf.variable_scope("attending"):
        # The metrics by which we decide what to attend to.
        similarity = tf.concat(
            [context_facts,
             current_mem,
             re_question_rnn,
             context_facts * re_question_rnn,  # each fact vs. the question
             context_facts * current_mem,      # each fact vs. the memory
             (context_facts - re_question_rnn)**2,
             (context_facts - current_mem)**2],
            2)

        def per_example(weights):
            # Tile the weights to broadcast manually, since tf.matmul does
            # not broadcast batch matrix multiplication (as of
            # TensorFlow 1.2).
            return tf.tile(weights,
                           tf.stack([tf.shape(similarity)[0], 1, 1]))

        # First layer, masked so that only existing facts contribute.
        hidden_pre = tf.matmul(similarity * existing_facts,
                               per_example(w_1)) * existing_facts
        # First nonlinearity. The original paper uses tanh; relu was chosen
        # to avoid low gradient magnitude when tanh saturates near 1 or -1.
        hidden = tf.nn.relu(hidden_pre + b_1 * existing_facts)

        # Second layer with its masked bias, normalised to help keep the
        # softmax from saturating.
        scores = tf.matmul(hidden, per_example(w_2))
        scores = tf.nn.l2_normalize(scores + b_2 * existing_facts, -1)

        # A hack to use sparse_softmax on an otherwise dense tensor: make
        # the scores sparse, then densify again after the softmax.
        nonzero_idx = tf.where(tf.not_equal(scores, 0))[:, :-1]
        nonzero_vals = tf.gather_nd(scores[..., 0], nonzero_idx)
        sparse_shape = tf.shape(scores, out_type=tf.int64)[:-1]
        sparse_scores = tf.SparseTensor(nonzero_idx, nonzero_vals,
                                        sparse_shape)

        weights = tf.sparse_tensor_to_dense(tf.sparse_softmax(sparse_scores))
        return tf.expand_dims(weights, -1)
def attention(c, mem, existing_facts):
    """
    Custom attention mechanism.

    Builds similarity features between the facts, the memory and the
    question, scores them with a two-layer feed-forward network, and
    normalizes the scores with a softmax.

    c: A [batch_size, maximum_sentence_count, recurrent_cell_size]
        tensor that contains all the facts from the contexts.
    mem: A [batch_size, maximum_sentence_count, recurrent_cell_size]
        tensor that contains the current memory. It should be the same
        memory for all facts for accurate results.
    existing_facts: A [batch_size, maximum_sentence_count, 1] tensor that
        acts as a binary mask for which facts exist and which do not.

    The question tensor `re_q` and the layer parameters `w_1`, `b_1`, `w_2`
    and `b_2` are not arguments; they are read from the enclosing scope.

    Returns the softmax of the normalized scores with a trailing axis of
    size 1 appended. Positions whose normalized score is exactly 0 are left
    out of the softmax and come back as 0.
    """
    with tf.variable_scope("attending"):
        # The metrics by which we decide what to attend to.
        similarity_terms = [
            c,
            mem,
            re_q,
            c * re_q,
            c * mem,
            (c - re_q) ** 2,
            (c - mem) ** 2,
        ]
        features = tf.concat(similarity_terms, 2)

        # First layer of the feed-forward network. The weights are tiled to
        # the batch size by hand because tf.matmul does not automatically
        # broadcast batch matrix multiplication as of TensorFlow 1.2.
        masked_inputs = features * existing_facts
        first_weights = tf.tile(
            w_1, tf.stack([tf.shape(features)[0], 1, 1]))
        first_layer = tf.matmul(masked_inputs, first_weights) * existing_facts

        # The bias only applies to facts that exist.
        masked_bias_1 = b_1 * existing_facts

        # First nonlinearity. The original paper uses tanh; relu was chosen
        # to avoid low gradient magnitudes when tanh returns values close
        # to 1 or -1.
        hidden = tf.nn.relu(first_layer + masked_bias_1)

        # Second layer, with the weights tiled for the same reason as above.
        second_weights = tf.tile(
            w_2, tf.stack([tf.shape(features)[0], 1, 1]))
        second_layer = tf.matmul(hidden, second_weights)
        masked_bias_2 = b_2 * existing_facts

        # Normalize the scores to help keep the softmax from saturating.
        scores = tf.nn.l2_normalize(second_layer + masked_bias_2, -1)

        # Hack to use sparse_softmax on an otherwise dense tensor: the
        # non-zero scores are moved into a sparse tensor, normalized there,
        # and then made dense again.
        # NOTE(review): dropping the last index column and reading
        # scores[..., 0] assumes a trailing axis of size 1 -- confirm.
        kept_positions = tf.where(tf.not_equal(scores, 0))[:, :-1]
        kept_scores = tf.gather_nd(scores[..., 0], kept_positions)
        kept_shape = tf.shape(scores, out_type=tf.int64)[:-1]
        sparse_scores = tf.SparseTensor(kept_positions, kept_scores,
                                        kept_shape)
        normalized = tf.sparse_softmax(sparse_scores)
        return tf.expand_dims(tf.sparse_tensor_to_dense(normalized), -1)
def sp_attn_head(seq, out_sz, adj_mat, adj_hop1_all_mat, adj_hop2_all_mat,
                 adj_hop1_neig_mat, adj_hop2_neig_mat, N_hop1_neig_mat,
                 N_hop2_neig_mat, activation, nb_nodes, in_drop=0.0,
                 coef_drop=0.0, residual=False):
    """Single sparse attention head (first GAT layer).

    Projects the node features, builds one attention logit per non-zero
    entry of ``adj_mat``, normalizes the logits with a sparse softmax and
    aggregates the projected features with the resulting coefficients.

    seq: input node features. The reshapes to (nb_nodes, 1) below assume a
        single graph per call, i.e. a leading axis of size 1 -- TODO
        confirm.
    out_sz: number of output features per node.
    adj_mat: adjacency structure; presumably a tf.SparseTensor of shape
        [nb_nodes, nb_nodes], since its products are fed to tf.sparse_add
        -- TODO confirm.
    activation: callable applied to the biased, aggregated features.
    nb_nodes: number of nodes in the graph.
    in_drop: dropout rate for the input and the projected features;
        0.0 disables it.
    coef_drop: dropout rate for the attention coefficients; 0.0 disables
        it.

    ``adj_hop1_all_mat``, ``adj_hop2_all_mat``, ``adj_hop1_neig_mat``,
    ``adj_hop2_neig_mat``, ``N_hop1_neig_mat``, ``N_hop2_neig_mat`` and
    ``residual`` are accepted but not used by this function.

    Returns ``activation`` applied to a tensor whose static shape is set
    to [1, nb_nodes, out_sz].
    """
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        # Linear projection of the node features (no bias).
        node_feats = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # Simplest self-attention possible: two scalar scores per node.
        row_scores = tf.layers.conv1d(node_feats, 1, 1)
        col_scores = tf.layers.conv1d(node_feats, 1, 1)
        row_scores = tf.reshape(row_scores, (nb_nodes, 1))
        col_scores = tf.reshape(col_scores, (nb_nodes, 1))

        # Spread the scores over the entries of adj_mat: the first term is
        # broadcast along rows, the transposed second term along columns.
        row_term = adj_mat * row_scores
        col_term = adj_mat * tf.transpose(col_scores, [1, 0])
        edge_logits = tf.sparse_add(row_term, col_term)

        activated_logits = tf.SparseTensor(
            indices=edge_logits.indices,
            values=tf.nn.leaky_relu(edge_logits.values),
            dense_shape=edge_logits.dense_shape)
        attn = tf.sparse_softmax(activated_logits)

        if coef_drop != 0.0:
            # Dropout only touches the stored coefficient values.
            attn = tf.SparseTensor(
                indices=attn.indices,
                values=tf.nn.dropout(attn.values, 1.0 - coef_drop),
                dense_shape=attn.dense_shape)
        if in_drop != 0.0:
            node_feats = tf.nn.dropout(node_feats, 1.0 - in_drop)

        # Aggregate the projected features with the attention coefficients.
        attn = tf.sparse_reshape(attn, [nb_nodes, nb_nodes])
        node_feats = tf.squeeze(node_feats)
        aggregated = tf.sparse_tensor_dense_matmul(attn, node_feats)

        # Restore the leading axis and pin the static shape.
        aggregated = tf.expand_dims(aggregated, axis=0)
        aggregated.set_shape([1, nb_nodes, out_sz])
        biased = tf.contrib.layers.bias_add(aggregated)

        return activation(biased)
def loop(curr_sample, new_h):
    """Propagate labels for one sample and append the result to ``new_h``.

    Reads ``col_entropies``, ``kl_mat``, ``h`` and ``self`` from the
    enclosing scope. The signature and return value look like the body of
    a ``tf.while_loop`` -- confirm against the caller.

    curr_sample: index of the column (sample) to process.
    new_h: tensor of the results accumulated so far along axis 0.

    Returns ``[curr_sample + 1, new_h]`` with the new sample appended.
    """
    # Edge weights are the sum of the entropy and KL features of the
    # current sample.
    entropy_feature = tf.gather(col_entropies, curr_sample, axis=1)
    kl_feature = tf.gather(kl_mat, curr_sample, axis=1)
    edge_weights = entropy_feature + kl_feature

    # Sparse weight matrix over the graph, normalized with a sparse softmax.
    weight_matrix = tf.SparseTensor(
        self.indices, edge_weights, [self.num_nodes, self.num_nodes])
    normalized_weights = tf.sparse_softmax(weight_matrix)

    # Propagate the labels of the current sample.
    sample_labels = tf.gather(h, curr_sample, axis=1)
    propagated = tf.sparse_tensor_dense_matmul(normalized_weights,
                                               sample_labels)

    # Append to the result.
    new_h = tf.concat([new_h, [propagated]], 0)
    return [curr_sample + 1, new_h]
def add_sparse_att_layer(self, inlayer, dual_layer):
    """Aggregate ``inlayer`` with attention weights derived from ``dual_layer``.

    Each row of ``dual_layer`` is reduced to one scalar by a 1x1
    convolution. The scalars are looked up with ``self.r_mat.values`` to
    give one logit per stored entry of ``self.r_mat``. The logits go
    through a leaky ReLU and a sparse softmax, and the resulting sparse
    coefficients multiply ``inlayer``.

    inlayer: dense tensor multiplied by the attention coefficients.
    dual_layer: tensor from which the attention logits are computed.

    Returns the aggregated tensor, passed through ``self.act_func`` when
    one is set.
    """
    # conv1d expects a leading batch axis; add it, then flatten the result
    # to one scalar per row.
    batched_dual = tf.expand_dims(dual_layer, 0)
    dual_scores = tf.reshape(tf.layers.conv1d(batched_dual, 1, 1), (-1, 1))

    # The values of r_mat are used as indices into the scores, giving one
    # logit per stored entry of r_mat.
    entry_scores = tf.nn.embedding_lookup(dual_scores, self.r_mat.values)
    edge_logits = tf.reshape(entry_scores, [-1])

    sparse_logits = tf.SparseTensor(
        indices=self.r_mat.indices,
        values=tf.nn.leaky_relu(edge_logits),
        dense_shape=self.r_mat.dense_shape)
    attention = tf.sparse_softmax(sparse_logits)
    aggregated = tf.sparse_tensor_dense_matmul(attention, inlayer)

    if self.act_func is None:
        return aggregated
    return self.act_func(aggregated)
def evaluate(self, G_holdout, Y_holdout):
    """ Perform cross validation on the hold-out set.

    This calculates the mean absolute error between the observed hold-out
    values and the values predicted for the same cells.

    Parameters
    ----------
    G_holdout : tf.Tensor
        Sample metadata for the hold-out test dataset. Rows are gathered
        by the sample index of each stored entry of `Y_holdout`.
    Y_holdout : tf.SparseTensor
        Feature table for the hold-out test dataset. It must be sparse:
        its `indices`, `values` and `dense_shape` are read directly.

    Returns
    -------
    mad : tf.Tensor
        Mean absolute deviation. This represents the average error for
        each stored cell value in the matrix.
    """
    with tf.name_scope('evaluate'):
        # Total observed value per sample (row sums of the hold-out table).
        holdout_count = tf.cast(tf.sparse_reduce_sum(Y_holdout, axis=1),
                                dtype=tf.float32)

        # Column and row index of every stored entry.
        obs_ids = tf.gather(Y_holdout.indices, 1, axis=1)
        samp_ids = tf.gather(Y_holdout.indices, 0, axis=1)
        g_data = tf.gather(G_holdout, samp_ids, axis=0)

        # Calculate predicted abundance.
        # A column of ones is prepended to the gathered metadata. The row
        # count comes from the dynamic shape, so this also works when the
        # number of stored entries is not known statically.
        ones_column = tf.ones(tf.stack([tf.shape(g_data)[0], 1]))
        Gpos = tf.concat([ones_column, g_data], axis=1, name='g_holdout')
        Vprime = tf.transpose(tf.gather(self.V, obs_ids, axis=1),
                              name='V_holdout')

        # sparse matrix multiplication for positive samples
        y_pred = tf.reduce_sum(tf.multiply(Gpos, Vprime), axis=1)

        # Softmax over the stored entries of each sample. A SparseTensor is
        # built here because SparseTensorValue is only a plain value
        # container, not a graph-side sparse tensor.
        smax = tf.sparse_softmax(
            tf.SparseTensor(indices=Y_holdout.indices,
                            values=y_pred,
                            dense_shape=Y_holdout.dense_shape))

        # Scale the per-entry proportions by the total of their sample.
        holdout_count = tf.gather(holdout_count, samp_ids, axis=0)
        pred_values = tf.cast(tf.multiply(holdout_count, smax.values),
                              tf.float32)
        Y_values = tf.cast(Y_holdout.values, tf.float32)

        mad = tf.reduce_mean(tf.squeeze(tf.abs(pred_values - Y_values)))
        return mad