def noisy_activation(x, generic, linearized, training, alpha=1.1, c=0.5):
    """Noisy activation with half-normal noise for hard-saturating functions.

    See http://arxiv.org/abs/1603.00391, Algorithm 1.

    Args:
        x: Tensor fed to the activation function.
        generic: Callable giving the generic formulation of the activation
            (denoted ``h`` in the paper).
        linearized: Callable giving the linearization of the activation, based
            on its first-order Taylor expansion around zero (denoted ``u`` in
            the paper).
        training: Boolean tensor. When true the noise is sampled; otherwise
            the constant ``sqrt(2 / pi)`` (the expectation) is used instead.
        alpha: Mixing hyper-parameter; the leakage rate from the linearized
            function to the nonlinear one.
        c: Standard deviation of the sampled noise.

    Returns:
        Tensor holding the noisy activation.
    """
    # Gap between the saturating function and its linearization.
    gap = generic(x) - linearized(x)
    # Direction in which the noise is applied.
    direction = -tf.sign(x) * tf.sign(1 - alpha)
    # Sigmoid slope, created as a variable initialised to 1.0.
    slope = tf.Variable(1.0)
    magnitude = c * (tf.sigmoid(slope * gap) - 0.5) ** 2
    sampled = tf.abs(tf.random_normal([]))
    expected = math.sqrt(2 / math.pi)
    noise = tf.select(training, sampled, expected)
    return alpha * generic(x) + (1 - alpha) * linearized(x) + direction * magnitude * noise
def ternary_operation(x):
    """Map ``x`` onto {-1, 0, 1} using a threshold computed from ``x``.

    The threshold comes from ``_compute_threshold``. While the ops are built,
    the gradient of every ``Sign`` op is overridden with ``Identity``.
    """
    graph = tf.compat.v1.get_default_graph()
    with graph.gradient_override_map({"Sign": "Identity"}):
        threshold = _compute_threshold(x)
        upper = tf.sign(tf.add(x, threshold))
        lower = tf.sign(tf.add(x, -threshold))
        ternary = tf.sign(tf.add(upper, lower))
    return ternary
def get_accuracy_loss(arg, x, y, y_):
    '''
    Build the loss and accuracy ops and register a scalar summary for each.

    With ``arg.softmax`` set, the loss is the mean softmax cross-entropy and
    the accuracy is the fraction of rows whose argmax matches the labels.
    Otherwise the loss is the summed per-column mean squared error and the
    accuracy is the fraction of entries whose sign matches the labels.

    ``x`` is accepted but not used here.

    Returns the pair ``(loss, accuracy)``.
    '''
    # NOTE(review): the original indentation was lost; the summaries are
    # assumed to live inside the name scope - confirm.
    with tf.name_scope("loss_and_acc") as scope:
        if arg.softmax:
            per_example = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
            loss = tf.reduce_mean(per_example)
            # Booleans marking the correctly classified rows.
            hits = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        else:
            loss = tf.reduce_sum(tf.reduce_mean(tf.square(y_ - y), 0))
            y = tf.cast(tf.sign(y), tf.float32)
            y_ = tf.cast(tf.sign(y_), tf.float32)
            # Booleans marking the entries whose sign agrees.
            hits = tf.equal(y, y_)
        accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)
    return loss, accuracy
def loss_func(logits):
    """Build the sigmoid cross-entropy loss and sign accuracy for ``logits``.

    Creates a float placeholder of shape ``[None, 361]`` for the target maps.

    Returns:
        ``(final_maps, cross_entropy_mean, accuracy)``: the target
        placeholder, the mean cross-entropy and the fraction of entries where
        the sign of the logits equals the sign of the targets.
    """
    final_maps = tf.placeholder(tf.float32, shape=[None, 361])
    # Targets arrive in [-1, 1]; rescale them to [0, 1] probabilities.
    scaled = final_maps * tf.constant(0.5)
    final_prob_maps = scaled + tf.constant(0.5)
    per_entry = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, targets=final_prob_maps)
    cross_entropy_mean = tf.reduce_mean(per_entry, name='cross_entropy_mean')
    same_sign = tf.equal(tf.sign(logits), tf.sign(final_maps))
    accuracy = tf.reduce_mean(tf.cast(same_sign, tf.float32))
    return final_maps, cross_entropy_mean, accuracy
def loss_func(score_op):
    """Build a log-based loss and sign accuracy for a score prediction.

    Creates a float placeholder of shape ``[None]`` for the target scores.
    The loss is ``mean(-log(1 - 0.5 * |score - target|))``.

    Returns:
        ``(final_scores, cross_entropy_ish_loss, accuracy)``.
    """
    final_scores = tf.placeholder(tf.float32, shape=[None])
    # Built but not returned; kept so the graph matches the original.
    squared_errors = tf.square(tf.reshape(score_op, [-1]) - final_scores)
    # NOTE(review): the name is attached to the log op, not to the mean.
    log_term = tf.log(
        tf.constant(1.0)
        - tf.constant(0.5) * tf.abs(tf.reshape(score_op, [-1]) - final_scores),
        name='cross-entropy-ish-loss')
    cross_entropy_ish_loss = tf.reduce_mean(-log_term)
    same_sign = tf.equal(tf.sign(tf.reshape(score_op, [-1])), tf.sign(final_scores))
    accuracy = tf.reduce_mean(tf.cast(same_sign, tf.float32), name='accuracy')
    return final_scores, cross_entropy_ish_loss, accuracy
def angular_symmetry(self, d_cutoff, d, atom_numbers, coordinates):
    """Angular symmetry function.

    Evaluates, for every atom triple, a cosine term raised to ``zeta`` times a
    Gaussian of the mean pair distance and both cutoff values, on a grid of
    ``Rs`` x ``thetas``, and sums the result over the two neighbour axes.

    When ``self.atomic_number_differentiated`` is set, one sum is produced per
    unordered pair of atom types in ``self.atom_cases`` and the sums are
    concatenated on axis 2.

    # assumes d / d_cutoff are (batch, max_atoms, max_atoms) and coordinates
    # is (batch, max_atoms, 3) - TODO confirm against the caller.
    """
    n_atoms = self.max_atoms

    def repeat(tensor, axis, copies=n_atoms):
        # Stack `copies` references of `tensor` along a new axis.
        return tf.stack([tensor] * copies, axis=axis)

    def as_grid_param(values):
        # Flatten a meshgrid output onto the trailing (5th) axis.
        return tf.cast(np.reshape(values, (1, 1, 1, 1, -1)), tf.float32)

    one_hot = tf.eye(np.max(self.atom_cases) + 1)
    atom_numbers_embedded = tf.nn.embedding_lookup(one_hot, atom_numbers)

    Rs = np.linspace(0., self.angular_cutoff, self.angular_length)
    ita = 3 / (Rs[1] - Rs[0])**2
    thetas = np.linspace(0., np.pi, self.angular_length)
    zeta = float(self.angular_length**2)

    ita, zeta, Rs, thetas = np.meshgrid(ita, zeta, Rs, thetas)
    zeta = as_grid_param(zeta)
    ita = as_grid_param(ita)
    Rs = as_grid_param(Rs)
    thetas = as_grid_param(thetas)
    grid_size = zeta.get_shape().as_list()[-1]

    displacement = repeat(coordinates, 1) - repeat(coordinates, 2)
    R_ij = repeat(d, 3)
    R_ik = repeat(d, 2)
    f_R_ij = repeat(d_cutoff, 3)
    f_R_ik = repeat(d_cutoff, 2)

    # Angle: theta = arccos(R_ij(vector) . R_ik(vector) / |R_ij| / |R_ik|)
    dot = tf.reduce_sum(repeat(displacement, 3) * repeat(displacement, 2), axis=4)
    dot = dot * tf.sign(f_R_ij) * tf.sign(f_R_ik)
    theta = tf.acos(tf.math.divide(dot, R_ij * R_ik + 1e-5))

    R_ij = repeat(R_ij, 4, grid_size)
    R_ik = repeat(R_ik, 4, grid_size)
    f_R_ij = repeat(f_R_ij, 4, grid_size)
    f_R_ik = repeat(f_R_ik, 4, grid_size)
    theta = repeat(theta, 4, grid_size)

    angular_term = tf.pow((1. + tf.cos(theta - thetas)) / 2., zeta)
    radial_term = tf.exp(-ita * tf.square((R_ij + R_ik) / 2. - Rs))
    out_tensor = angular_term * radial_term * f_R_ij * f_R_ik * 2

    if not self.atomic_number_differentiated:
        return tf.reduce_sum(out_tensor, axis=(2, 3))

    per_pair = []
    for start, type_j in enumerate(self.atom_cases):
        for type_k in self.atom_cases[start:]:
            pair_mask = repeat(atom_numbers_embedded[:, :, type_j], 2) * \
                repeat(atom_numbers_embedded[:, :, type_k], 1)
            pair_mask = tf.expand_dims(tf.expand_dims(pair_mask, axis=1), axis=4)
            per_pair.append(tf.reduce_sum(out_tensor * pair_mask, axis=(2, 3)))
    return tf.concat(per_pair, axis=2)
def main():
    """Fit a linear model with gradient descent and log summaries.

    Loads the data through the module-level ``input()`` helper, prints some
    diagnostics, evaluates a few sign-based error expressions, then runs 1000
    gradient-descent steps on the summed squared error while writing loss and
    weight summaries to ``./train``. The fitted ``w`` and ``b`` are printed at
    the end.

    Relies on module-level ``x_placehold``, ``y_placehold``, ``w``, ``b`` and
    ``feed_dict``.
    """
    data, labels = input()
    # logits=inference()
    # loss_step=loss(logits)
    # train_step = train(loss_step,0.001)
    # sess=tf.Session()
    # sess.run(tf.global_variables_initializer())
    # print(sess.run([w,b]))
    # labels.shape=(6000,1)
    # for i in range(1000):
    #     sess.run(train_step,feed_dict={x_placehold:data,y_placehold:labels})
    #     wc,bc=sess.run([w,b],feed_dict={x_placehold:data,y_placehold:labels})
    #     print(wc,bc)
    labels.shape = (6000, 1)
    print(data)
    print(labels)
    print(data.shape)
    print(labels.shape)
    print(feed_dict('D'))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    # sess.run(tf.sign(tf.matmul(x_placehold, w) + b),
    #          feed_dict=feed_dict('D'))
    sess.run(tf.sign(tf.matmul(x_placehold, w) + b) - y_placehold,
             feed_dict=feed_dict())
    sess.run(tf.square(tf.sign(tf.matmul(x_placehold, w) + b) - y_placehold),
             feed_dict=feed_dict())
    sess.run(tf.reduce_sum(tf.square(tf.sign(tf.matmul(x_placehold, w) + b) - y_placehold)),
             feed_dict=feed_dict())

    logits = tf.matmul(x_placehold, w) + b
    loss_op = tf.reduce_sum(tf.square(logits - y_placehold))
    with tf.name_scope('loss'):
        tf.summary.scalar('error', loss_op)
    with tf.name_scope('w'):
        tf.summary.scalar('x', w[0, 0])
        tf.summary.scalar('y', w[1, 0])
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('./train', sess.graph)

    optimizer = tf.train.GradientDescentOptimizer(0.1)
    # Build the training op once. Calling minimize() inside the loop adds a
    # fresh set of gradient ops to the graph on every iteration.
    train_op = optimizer.minimize(loss_op)
    for i in range(1000):
        sess.run(train_op, feed_dict=feed_dict())
        summary = sess.run(merged, feed_dict=feed_dict())
        train_writer.add_summary(summary, i)

    # NOTE(review): original indentation was lost; the final report is assumed
    # to run once after the loop - confirm.
    wc, bc = sess.run([w, b], feed_dict=feed_dict())
    print(wc, bc)

    # Flush pending summaries and release the session.
    train_writer.close()
    sess.close()
def neural_attention(embedding_dim=384, encoding_dim=128):
    """Build an iterative attention graph over a document and a query.

    Reads ``X``, ``Q``, ``vocab_size``, ``keep_prob``, ``batch_size`` and
    ``glimpse`` from the enclosing scope (none of them are parameters).

    Args:
        embedding_dim: Width of the shared embedding table.
        encoding_dim: Number of units of each encoder GRU.

    Returns:
        The document attention from the last of 8 inference steps, multiplied
        by a float mask that is 1 where ``X`` is non-zero.

    NOTE(review): the original indentation of this block was lost; the scope
    nesting below is reconstructed and should be confirmed.
    """
    embeddings = tf.Variable(tf.random_normal([vocab_size, embedding_dim], stddev=0.22), dtype=tf.float32)
    # L2 penalty on the embedding table.
    tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(1e-4), [embeddings])
    with tf.variable_scope('encode'):
        with tf.variable_scope('X'):
            # Sequence length = number of non-zero ids per row.
            X_lens = tf.reduce_sum(tf.sign(tf.abs(X)), 1)
            embedded_X = tf.nn.embedding_lookup(embeddings, X)
            encoded_X = tf.nn.dropout(embedded_X, keep_prob)
            # NOTE(review): the same cell object is passed as both the forward
            # and the backward cell.
            gru_cell = tf.contrib.rnn.core_rnn_cell.GRUCell(encoding_dim)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                gru_cell, gru_cell, encoded_X, sequence_length=X_lens,
                dtype=tf.float32, swap_memory=True)
            # Concatenate forward and backward outputs on the feature axis.
            encoded_X = tf.concat(outputs, 2)
        with tf.variable_scope('Q'):
            Q_lens = tf.reduce_sum(tf.sign(tf.abs(Q)), 1)
            embedded_Q = tf.nn.embedding_lookup(embeddings, Q)
            encoded_Q = tf.nn.dropout(embedded_Q, keep_prob)
            gru_cell = tf.contrib.rnn.core_rnn_cell.GRUCell(encoding_dim)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                gru_cell, gru_cell, encoded_Q, sequence_length=Q_lens,
                dtype=tf.float32, swap_memory=True)
            encoded_Q = tf.concat(outputs, 2)
    # Glimpse projections (query / document) and gate weights.
    W_q = tf.Variable(tf.random_normal([2 * encoding_dim, 4 * encoding_dim], stddev=0.22), dtype=tf.float32)
    b_q = tf.Variable(tf.random_normal([2 * encoding_dim, 1], stddev=0.22), dtype=tf.float32)
    W_d = tf.Variable(tf.random_normal([2 * encoding_dim, 6 * encoding_dim], stddev=0.22), dtype=tf.float32)
    b_d = tf.Variable(tf.random_normal([2 * encoding_dim, 1], stddev=0.22), dtype=tf.float32)
    g_q = tf.Variable(tf.random_normal([10 * encoding_dim, 2 * encoding_dim], stddev=0.22), dtype=tf.float32)
    g_d = tf.Variable(tf.random_normal([10 * encoding_dim, 2 * encoding_dim], stddev=0.22), dtype=tf.float32)
    with tf.variable_scope('attend') as scope:
        infer_gru = tf.contrib.rnn.core_rnn_cell.GRUCell(4 * encoding_dim)
        infer_state = infer_gru.zero_state(batch_size, tf.float32)
        for iter_step in range(8):
            # Share the inference GRU weights across steps.
            if iter_step > 0:
                scope.reuse_variables()
            _, q_glimpse = glimpse(W_q, b_q, encoded_Q, infer_state)
            d_attention, d_glimpse = glimpse(W_d, b_d, encoded_X, tf.concat([infer_state, q_glimpse], 1 ))
            gate_concat = tf.concat([infer_state, q_glimpse, d_glimpse, q_glimpse * d_glimpse], 1)
            r_d = tf.sigmoid(tf.matmul(gate_concat, g_d))
            r_d = tf.nn.dropout(r_d, keep_prob)
            r_q = tf.sigmoid(tf.matmul(gate_concat, g_q))
            r_q = tf.nn.dropout(r_q, keep_prob)
            combined_gated_glimpse = tf.concat([r_q * q_glimpse, r_d * d_glimpse], 1)
            _, infer_state = infer_gru(combined_gated_glimpse, infer_state)
    # Zero out the attention on positions where X is 0.
    return tf.to_float(tf.sign(tf.abs(X))) * d_attention
def one_bp_iteration(self, xe_v2c_pre_iter, H_sumC_to_V, H_sumV_to_C, xe_0):
    """Run one belief-propagation iteration.

    The product over messages is evaluated as ``exp`` of a sum of logs: the
    real part carries ``log|tanh(x / 2)|`` and the imaginary part carries
    ``pi`` for every negative factor, so the real part of the exponential
    recovers the signed product.

    Returns:
        ``(xe_v_sumc, xe_c_sumv)``.
    """
    half_tanh = tf.tanh(tf.to_double(tf.truediv(xe_v2c_pre_iter, [2.0])))
    half_tanh = tf.to_float(half_tanh)
    tanh_sign = tf.sign(half_tanh)
    # Imaginary part: pi where the factor is negative, 0 where it is positive.
    log_imag = tf.matmul(
        H_sumC_to_V,
        tf.multiply(tf.truediv((1 - tanh_sign), [2.0]), [3.1415926]))
    # Real part: log magnitude, offset by 1e-8 to avoid log(0).
    log_real = tf.matmul(H_sumC_to_V, tf.log(1e-8 + tf.abs(half_tanh)))
    log_complex = tf.complex(log_real, log_imag)
    product = tf.real(tf.exp(log_complex))
    # Shift the product by 2e-7 towards zero before applying atanh.
    shrink = tf.multiply(tf.sign(product), -2e-7)
    product_modified = tf.add(product, shrink)
    xe_v_sumc = tf.multiply(self.atanh(product_modified), [2.0])
    xe_c_sumv = tf.add(xe_0, tf.matmul(H_sumV_to_C, xe_v_sumc))
    return xe_v_sumc, xe_c_sumv
def _apply(self, grad, var, indices=None):
    """Update the ``m`` slot and apply one step to ``var``.

    The step is ``lr * grad``, multiplied by ``self._decrease_factor`` at the
    positions where the sign of the updated moving average differs from the
    sign of the gradient.

    Returns:
        A grouped op containing the variable update and the slot update.
    """
    dtype = var.dtype.base_dtype
    lr = tf.cast(self._learning_rate_tensor, dtype)
    m = self.get_slot(var, "m")

    # m_t = beta1 * m + (1 - beta1) * g_t
    beta1_t = tf.cast(self._beta1_t, dtype)
    scaled_grad = grad * (1 - beta1_t)
    m_t = tf.assign(m, m * beta1_t, use_locking=self._use_locking)
    with tf.control_dependencies([m_t]):
        m_t = self._assign_add(m, updates=scaled_grad, indices=indices)

    # update = lr * grad * where(...)
    m_gathered = self._gather(m_t, indices=indices)
    ones = tf.ones_like(grad)
    base_step = lr * grad
    signs_agree = tf.equal(tf.sign(m_gathered), tf.sign(grad))
    factor = tf.where(signs_agree, ones, ones * self._decrease_factor)
    update = base_step * factor

    var_update = self._assign_sub(ref=var, updates=update, indices=indices)
    return tf.group(var_update, m_t)
def _apply_dense(self, grad, var):
    """Apply one dense update to ``var`` and refresh its ``m`` slot.

    The slot is set to ``max(beta * m + eps, |grad|)``. The variable moves by
    ``lr * grad * exp(log(alpha) * sign(grad) * sign(m_t))``.

    Returns:
        A grouped op that finishes once both updates have finished.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    alpha_t = math_ops.cast(self._alpha_t, dtype)
    beta_t = math_ops.cast(self._beta_t, dtype)

    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))

    base_step = lr_t * grad
    scale = tf.exp(tf.log(alpha_t) * tf.sign(grad) * tf.sign(m_t))
    # Subtract the scaled step from the variable.
    var_update = state_ops.assign_sub(var, base_step * scale)
    return control_flow_ops.group(var_update, m_t)
def retrieve_seq_length_op(data):
    """Build an op computing the length of each sequence; zeros are masked.

    A step counts as used when the maximum absolute value along axis 2 is
    non-zero; the used steps are summed along axis 1 and cast to int32.
    """
    with tf.name_scope('GetLength'):
        step_peak = tf.reduce_max(tf.abs(data), reduction_indices=2)
        step_used = tf.sign(step_peak)
        total = tf.reduce_sum(step_used, reduction_indices=1)
        length = tf.cast(total, tf.int32)
    return length
def proximal_step(train_op, lr):
    """Wrap ``train_op`` with weight-decay and proximal L1 updates.

    After ``train_op`` has run:

    * if ``L2_LOSS_WEIGHT > 0``, every variable in the
      ``utils.WEIGHT_DECAY_KEY`` collection is decayed by
      ``lr * L2_LOSS_WEIGHT * var``;
    * if ``L1_LOSS_WEIGHT > 0``, every variable in the ``utils.LASSO_KEY``
      collection is soft-thresholded by ``L1_LOSS_WEIGHT * lr``, and entries
      below the threshold are set to zero (non-negative weights constraint).

    Args:
        train_op: The op performing the gradient step.
        lr: Learning rate used to scale both updates.

    Returns:
        A no-op named ``proximal_step`` that runs after ``train_op`` and all
        regularisation updates.
    """
    l2_op_list = []
    l1_op_list = []
    with tf.control_dependencies([train_op]):
        # Weight decay for the variables with L2 loss. If basenet weights are
        # trained together, do not set a weight decay on the conv layers of
        # the basenet.
        if L2_LOSS_WEIGHT > 0:
            for var in tf.get_collection(utils.WEIGHT_DECAY_KEY):
                assign_op = var.assign_add(- lr * tf.convert_to_tensor(L2_LOSS_WEIGHT) * var)
                l2_op_list.append(assign_op)
                print('\tL2 loss added: %s(strength: %f)' % (var.name, L2_LOSS_WEIGHT))

        # Proximal gradient for the variables with L1 lasso loss.
        # Non-negative weights constraint.
        if L1_LOSS_WEIGHT > 0:
            for var in tf.get_collection(utils.LASSO_KEY):
                th_t = tf.fill(tf.shape(var), tf.convert_to_tensor(L1_LOSS_WEIGHT) * lr)
                zero_t = tf.zeros(tf.shape(var))
                var_temp = var - th_t * tf.sign(var)
                assign_op = var.assign(tf.select(tf.less(var, th_t), zero_t, var_temp))
                l1_op_list.append(assign_op)
                print('\tL1 loss added: %s(strength: %f)' % (var.name, L1_LOSS_WEIGHT))

    # Depend on train_op explicitly: when both regularisers are disabled the
    # op lists are empty, and the returned op must still trigger training.
    with tf.control_dependencies([train_op] + l2_op_list + l1_op_list):
        proximal_op = tf.no_op(name='proximal_step')
    return proximal_op
def SoftThreshold(t, threshold_ratio, name=None):
    """Soft-threshold a tensor by the mean absolute value.

    Each dimension-0 vector (for a matrix, each column) is soft-thresholded by
    the mean of its absolute values multiplied by ``threshold_ratio``. Columns
    are used because each one corresponds to a unit in a layer.

    Args:
        t: the input tensor.
        threshold_ratio: the threshold ratio; must be non-negative.
        name: the optional name for the returned tensor.

    Returns:
        The thresholded tensor, with the same shape as ``t``.
    """
    assert threshold_ratio >= 0
    with tf.op_scope([t, threshold_ratio], name, "soft_thresholding") as scope_name:
        original_shape = tf.shape(t)
        # Flatten everything after the first dimension.
        leading_dim = tf.slice(original_shape, [0], [1])
        flat = tf.reshape(t, tf.concat(0, [leading_dim, -1]))
        magnitude = tf.abs(flat)
        cutoff = tf.reduce_mean(magnitude, [0], keep_dims=True) * threshold_ratio
        shrunk = tf.sign(flat) * tf.nn.relu(magnitude - cutoff)
        return tf.reshape(shrunk, original_shape, name=scope_name)
def _binarizer(prebinary_codes, is_training):
    """Binarize compression logits.

    During training, uniform noise in [-1, 1) is added, as in
    https://arxiv.org/pdf/1611.01704.pdf. During eval, the sign is taken,
    mapping [-1, 1] -> {-1, 1}.

    Args:
        prebinary_codes: Floating-point tensor of pre-binary codes. Shape is
            [batch, code_length].
        is_training: A python bool. If True, add noise. If False, binarize.

    Returns:
        Binarized codes. Shape is [batch, code_length].

    Raises:
        ValueError: If the shape of `prebinary_codes` isn't static.
            (NOTE(review): not raised explicitly here; presumably comes from
            the noise op - confirm.)
    """
    if not is_training:
        return tf.sign(prebinary_codes)

    # Noise lets the codes be trained so they can be binarized at eval time.
    # An alternative is a stochastic node, as in
    # https://arxiv.org/abs/1608.05148.
    jitter = tf.random_uniform(prebinary_codes.shape, minval=-1.0, maxval=1.0)
    return prebinary_codes + jitter
def speech_data_seq_len(self, data):
    '''
    Return the sequence length of each char matrix in the batch.

    Assumes the one-hot char matrix is batchsize x max speech length x vocab
    length. A step counts as used when the sum of absolute values along
    axis 2 is non-zero. The result is not cast to an integer type.
    '''
    step_mass = tf.reduce_sum(tf.abs(data), reduction_indices=2)
    step_used = tf.sign(step_mass)
    return tf.reduce_sum(step_used, reduction_indices=1)
def __call__(self, x, keep_prob=1.0, seq_length=None):
    """Run a two-layer bidirectional LSTM over ``x``.

    Args:
        x: Input tensor.
            # assumes (batch, time, features) - TODO confirm; the code
            # transposes axes [1, 0, 2] before unstacking.
        keep_prob: Currently unused (the dropout call is commented out).
        seq_length: Optional per-example lengths. When ``None`` they are
            derived from ``x``, supposing that the padding is zeros.

    Returns:
        ``(lstm_out, seq_length)``.
    """
    # __call__ is very efficient when the state of instance changes frequently
    with tf.variable_scope(self.name, reuse=self.reuse) as vs:
        self.fw_cell = tf.contrib.rnn.LSTMCell(self.cell_size, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
        self.fw_cell1 = tf.contrib.rnn.LSTMCell(self.cell_size, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
        self.bw_cell = tf.contrib.rnn.LSTMCell(self.cell_size, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
        self.bw_cell1 = tf.contrib.rnn.LSTMCell(self.cell_size, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
        self.fw_cells = tf.contrib.rnn.MultiRNNCell([self.fw_cell, self.fw_cell1], state_is_tuple=True)
        self.bw_cells = tf.contrib.rnn.MultiRNNCell([self.bw_cell, self.bw_cell1], state_is_tuple=True)

        # Identity check: `== None` on an array or tensor may be evaluated
        # element-wise instead of yielding a plain bool.
        if seq_length is None:
            # get the real sequence length (suppose that the padding are zeros)
            used = tf.sign(tf.reduce_max(tf.abs(x), reduction_indices=2))
            seq_length = tf.cast(tf.reduce_sum(used, reduction_indices=1), tf.int32)

        lstm_out, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            self.fw_cells, self.bw_cells,
            tf.unstack(tf.transpose(x, [1, 0, 2])),
            dtype=tf.float32, sequence_length=seq_length)
        lstm_out = tf.transpose(tf.stack(lstm_out), [1, 0, 2])
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('lstm_out:  %s' % (lstm_out,))
        # shape(lstm_out) = (self.batch_size, sequence_length, 2*cell_size)
        # if keep_prob < 1.:
        #     lstm_out = tf.nn.dropout(lstm_out, keep_prob)

        # Record the variables the first time the scope is built.
        if self.reuse is None:
            self.trainable_weights = vs.global_variables()

    self.reuse = True
    return lstm_out, seq_length
def encode(self, x, noise):
    """Encode ``x`` as ``sign(x) * (|x| * 128) ** 8`` in unbiased bfloat16.

    The eighth power is obtained by squaring three times because
    ``tf.pow(..., 8.0)`` is a high-error approximation on TPU.
    """
    values = tf.to_float(x)
    sign = tf.sign(values)
    scaled = tf.abs(values) * 128.0
    eighth_power = tf.square(tf.square(tf.square(scaled)))
    return _to_bfloat16_unbiased(sign * eighth_power, noise)
def triangle_wave(frequency):
    """Emit a triangle wave at the given frequency."""
    sample_index = tf.range(_samples(), dtype=tf.float32)
    xs = tf.reshape(sample_index, [1, _samples(), 1])
    ts = xs / FLAGS.sample_rate

    # A triangle wave looks like this:
    #
    #     /\      /\
    #    /  \    /  \
    #        \  /    \  /
    #         \/      \/
    #
    # Each half period ("mountain" or "valley") is a transformed absolute
    # value function, so first locate every sample within its half-pulse.
    half_pulse_index = ts * (frequency * 2)
    position = half_pulse_index % 1.0

    # A positive half-pulse has amplitude A(z) = 0.5 - abs(z - 0.5),
    # normalised by its peak of 0.5.
    unsigned = (0.5 - tf.abs(position - 0.5)) / 0.5

    # Every other half-pulse is negative, so flip the sign of those.
    parity = tf.sign(1 - (half_pulse_index % 2.0))
    return parity * unsigned
def retrieve_seq_length_op(data):
    """Build an op computing the length of each sequence; zeros are masked.

    A step counts as used when the maximum absolute value along axis 2 is
    non-zero; the used steps are summed along axis 1 and cast to int32.
    """
    with tf.name_scope('GetLength'):
        step_peak = tf.reduce_max(tf.abs(data), axis=2)
        step_used = tf.sign(x=step_peak)
        total = tf.reduce_sum(input_tensor=step_used, axis=1)
        length = tf.cast(x=total, dtype=tf.int32)
    return length
def build(self):
    """Build the tensorflow computation graph for the transform.

    Creates a fresh graph holding a float placeholder of shape
    ``(None, self.max_atoms, 4)``: column 0 is read as the atom number and the
    remaining columns as coordinates. ``self.inputs`` and ``self.outputs`` are
    set as a side effect.

    Returns:
        The graph that was built.
    """
    graph = tf.Graph()
    with graph.as_default():
        self.inputs = tf.placeholder(tf.float32, shape=(None, self.max_atoms, 4))
        atom_numbers = tf.cast(self.inputs[:, :, 0], tf.int32)

        # Pair mask: non-zero only where both atom numbers are non-zero.
        present = tf.sign(atom_numbers)
        pair_flags = tf.cast(
            tf.expand_dims(present, 1) * tf.expand_dims(present, 2), tf.float32)

        coordinates = self.inputs[:, :, 1:]
        if self.coordinates_in_bohr:
            coordinates = coordinates * 0.52917721092

        d = self.distance_matrix(coordinates, pair_flags)
        d_radial_cutoff = self.distance_cutoff(d, self.radial_cutoff, pair_flags)
        d_angular_cutoff = self.distance_cutoff(d, self.angular_cutoff, pair_flags)
        radial_sym = self.radial_symmetry(d_radial_cutoff, d, atom_numbers)
        angular_sym = self.angular_symmetry(d_angular_cutoff, d, atom_numbers,
                                            coordinates)

        atom_column = tf.cast(tf.expand_dims(atom_numbers, 2), tf.float32)
        self.outputs = tf.concat([atom_column, radial_sym, angular_sym], axis=2)
    return graph
def random_sign_uniform(
        shape, minval=None, maxval=None, dtype=tf.float32, seed=None):
    """Tensor with (possibly complex) random entries from a "sign Uniform".

    Letting `Z` be a random variable equal to `-1` and `1` with equal
    probability, samples from this `Op` are distributed like

    ```
    Z * X, where X ~ Uniform[minval, maxval], if dtype is real,
    Z * (X + iY), where X, Y ~ Uniform[minval, maxval], if dtype is complex.
    ```

    Args:
        shape: `TensorShape` or Python list. Shape of the returned tensor.
        minval: `0-D` `Tensor` giving the minimum values.
        maxval: `0-D` `Tensor` giving the maximum values.
        dtype: `TensorFlow` `dtype` or Python dtype.
        seed: Python integer seed for the RNG.

    Returns:
        `Tensor` with desired shape and dtype.
    """
    dtype = tf.as_dtype(dtype)
    with tf.name_scope("random_sign_uniform"):
        magnitudes = random_uniform(
            shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
        # Use a different seed for the signs than for the magnitudes.
        sign_seed = seed
        if sign_seed is not None:
            sign_seed += 12
        raw = tf.random_uniform(shape, minval=-1., maxval=1., seed=sign_seed)
        signs = tf.sign(raw)
        return magnitudes * tf.cast(signs, magnitudes.dtype)
def loss(logits, labels):
    """Calculates Mean Pixel Error.

    The absolute error is weighted by the sign of the label before averaging.
    The mean is added to the 'losses' collection.

    Args:
        logits: Logits from inference().
        labels: Labels from distorted_inputs or inputs(). 1-D tensor
            of shape [batch_size]

    Returns:
        ``(total_loss, loss_mean)``: the sum of everything in the 'losses'
        collection, and the mean pixel error on its own.
    """
    validity = tf.sign(labels, name='label_validity')
    difference = tf.sub(logits, labels, name='Diff_Op')
    abs_error = tf.abs(difference, name='Abs_Op')
    weighted = tf.mul(validity, abs_error, name='lossValues')
    loss_mean = tf.reduce_mean(weighted, name='MeanPixelError')
    tf.add_to_collection('losses', loss_mean)
    total = tf.add_n(tf.get_collection('losses'), name='total_loss')
    return total, loss_mean
def _non_linear_grad(cls, op, grad):
    """Gradient override for non-linear ops: ``grad * output / input``.

    The input is pushed away from zero by ``cls._eps`` in the direction of its
    sign before dividing.

    Args:
        op: The op being differentiated; its first input and first output are
            used.
        grad: Incoming gradient.

    Returns:
        Tensor ``grad * op.outputs[0] / (op.inputs[0] + stabilizer)``.
    """
    LRP.logger.debug("Computing non-linear gradient with activation type {}".format(op.type))
    op_out = op.outputs[0]
    op_in = op.inputs[0]
    # tf.sign(0) is 0, which would leave a zero denominator (0 / 0 -> NaN)
    # exactly where stabilisation is needed; treat zero inputs as positive.
    ones = tf.ones_like(op_in)
    direction = tf.where(op_in >= 0, ones, -ones)
    stabilized_in = op_in + cls._eps * direction
    return grad * op_out / stabilized_in
def binomial_sampling(self, pr):
    """Binomial sampling of hidden unit activations by rejection.

    Basic mechanics:

    1) Draw a uniform random number (g) per unit and compare it with the
       unit's probability (pr).
    2) Output 1 where pr > g and 0 otherwise, implemented as
       ``relu(sign(pr - g))``.

    NumPy's global RNG is re-seeded with ``self.seed`` on every call, and g is
    drawn with NumPy before being converted to a tensor.

    Args:
        pr (tensor, float32): input conditional probability.

    Returns:
        h_sampled (tensor, float32): sampled units; 1 if pr > g, 0 otherwise.
    """
    np.random.seed(self.seed)
    # One uniform threshold per unit (second dimension of pr).
    thresholds = np.random.uniform(size=pr.shape[1])
    g = tf.convert_to_tensor(thresholds, dtype=tf.float32)
    return tf.nn.relu(tf.sign(pr - g))
def __init__(self, length_batch, features_batch, labels_batch):
    """Build an LSTM sequence classifier with masked loss and accuracy.

    A 128-unit LSTM runs over ``features_batch``; its flattened outputs feed a
    26-way fully connected layer. Positions whose label is 0 are masked out of
    the loss and accuracy. Gradients are clipped to a global norm of 5.0 and
    applied with plain gradient descent at learning rate 0.1.
    """
    n_classes = 26
    n_hidden = 128

    self.labels_flat = tf.reshape(labels_batch, [-1])
    self.labels_one_hot = tf.one_hot(labels_batch, n_classes)
    self.labels_one_hot_flat = tf.reshape(self.labels_one_hot, [-1, n_classes])

    self.lstm = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    self.lstm_outputs, _ = tf.nn.dynamic_rnn(
        self.lstm,
        features_batch,
        sequence_length=length_batch,
        time_major=False,
        dtype=tf.float32)
    self.flat_lstm_outputs = tf.reshape(self.lstm_outputs, [-1, n_hidden])
    self.outputs = tflearn.fully_connected(self.flat_lstm_outputs, n_classes)

    # Mask out padding (label 0).
    self.losses = tf.nn.softmax_cross_entropy_with_logits(
        self.outputs, self.labels_one_hot_flat)
    self.mask = tf.to_float(tf.sign(self.labels_flat))
    self.masked_losses = self.mask * self.losses
    self.mean_loss = tf.reduce_sum(self.masked_losses / tf.reduce_sum(self.mask))

    self.predictions = tf.argmax(self.outputs, 1)
    self.accurate = tf.equal(self.predictions, self.labels_flat)
    self.accuracy = tf.reduce_sum(tf.to_float(self.accurate) * self.mask) / tf.reduce_sum(self.mask)

    trainables = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.mean_loss, trainables), 5.0)
    sgd = tf.train.GradientDescentOptimizer(0.1)
    self.train = sgd.apply_gradients(zip(clipped, trainables))
def configurable_params_turn_on(self, args, options):
    """Build a randomly switched "on" value.

    When ``"random"`` is in ``args``, a uniform sample in [-1, 1) is shifted
    by ``options["offset"]``; the result is ``onvalue`` where the shifted
    sample is positive and 0 where it is negative.

    Args:
        args: Container of flags; only ``"random"`` is inspected.
        options: Mapping providing ``"offset"`` and, for the random mode,
            ``"onvalue"`` (a falsy ``onvalue`` of 0 falls back to 1.0).

    Returns:
        A float32 tensor of shape ``[1]`` in random mode, otherwise ``None``.
    """
    offset = float(options["offset"]) or 0.0
    if "random" in args:
        onvalue = float(options["onvalue"]) or 1.0
        n = tf.random_uniform([1], minval=-1, maxval=1)
        n += tf.constant(offset, dtype=tf.float32)
        # The original passed dtype= to float() (misplaced parenthesis), which
        # always raised TypeError; the dtype belongs to tf.constant.
        return (tf.sign(n) + 1) / 2 * tf.constant(onvalue, dtype=tf.float32)
    return None
def NTanh(x, use_noise, alpha=1.05, c=0.5, half_normal=False):
    """
    Noisy Hard Tanh Units: NAN without learning p
    ----------------------------------------------------
    Arguments:
        x: tensorflow tensor variable, input of the function.
        use_noise: whether to add noise to the activations; it is passed to
            tf.cond as the predicate. Useful at test time to disable the
            noise injection.
        alpha: the leaking rate from the linearized function to the nonlinear
            one.
        c: float, standard deviation of the noise.
        half_normal: python bool; when set, the no-noise branch returns the
            constant 0.797 instead of 0, and the scale is negated if
            alpha > 1.0.
    """
    threshold = 1.0
    signs = tf.sign(x)
    delta = tf.abs(x) - threshold

    scale = c * (tf.sigmoid(delta**2) - 0.5)**2
    if alpha > 1.0 and half_normal:
        scale = scale * -1.0

    zeros = tf.zeros(tf.shape(x), dtype=tf.float32, name=None)

    def noise_func():
        # Absolute value of a standard normal sample.
        return tf.abs(tf.random_normal(tf.shape(x), mean=0.0, stddev=1.0, dtype=tf.float32))

    def zero_func():
        if half_normal:
            return zeros + 0.797
        return zeros

    noise = tf.cond(use_noise, noise_func, zero_func)

    eps = scale * noise + alpha * delta
    z = x - signs * eps
    # 1.0 where the unit is saturated (|x| >= threshold), 0.0 elsewhere.
    saturated = tf.cast(tf.greater_equal(tf.abs(x), threshold), tf.float32)
    return saturated * z + (1. - saturated) * HardTanh(x)
def __graph__():
    """Building the inference graph

    Creates the input placeholders, a linear layer, a squared-hinge (L2-SVM)
    loss with an Adam training op, and an accuracy op, then stores the handles
    on ``self`` (captured from the enclosing scope).

    NOTE(review): the original indentation of this block was lost; the
    name-scope nesting below is reconstructed and should be confirmed, since
    it determines op names.
    """
    with tf.name_scope('input'):
        # [BATCH_SIZE, NUM_FEATURES]
        x_input = tf.placeholder(dtype=tf.float32, shape=[None, self.num_features], name='x_input')
        # [BATCH_SIZE]
        y_input = tf.placeholder(dtype=tf.uint8, shape=[None], name='y_input')
        # [BATCH_SIZE, NUM_CLASSES]; labels are encoded as +1 / -1.
        y_onehot = tf.one_hot(indices=y_input, depth=self.num_classes, on_value=1, off_value=-1, name='y_onehot')
    learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')
    with tf.name_scope('training_ops'):
        with tf.name_scope('weights'):
            weight = tf.get_variable(name='weights', initializer=tf.random_normal([self.num_features, self.num_classes], stddev=0.01))
            self.variable_summaries(weight)
        with tf.name_scope('biases'):
            bias = tf.get_variable(name='biases', initializer=tf.constant([0.1], shape=[self.num_classes]))
            self.variable_summaries(bias)
        with tf.name_scope('Wx_plus_b'):
            output = tf.matmul(x_input, weight) + bias
            tf.summary.histogram('pre-activations', output)
    with tf.name_scope('svm'):
        # Mean squared weight as the regularisation term.
        regularization = tf.reduce_mean(tf.square(weight))
        # Squared hinge loss: mean(max(0, 1 - y * output) ** 2).
        hinge_loss = tf.reduce_mean(tf.square(tf.maximum(tf.zeros([self.batch_size, self.num_classes]), 1 - tf.cast(y_onehot, tf.float32) * output)))
        with tf.name_scope('loss'):
            loss = regularization + self.svm_c * hinge_loss
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    with tf.name_scope('accuracy'):
        predicted_class = tf.sign(output)
        predicted_class = tf.identity(predicted_class, name='prediction')
        with tf.name_scope('correct_prediction'):
            correct = tf.equal(tf.argmax(predicted_class, 1), tf.argmax(y_onehot, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()
    # Expose the graph handles on the instance.
    self.x_input = x_input
    self.y_input = y_input
    self.y_onehot = y_onehot
    self.learning_rate = learning_rate
    self.loss = loss
    self.optimizer = optimizer
    self.output = output
    self.predicted_class = predicted_class
    self.accuracy = accuracy
    self.merged = merged
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings):
    """
    Create the X model
    :param bert_config: BERT configuration
    :param is_training:
    :param input_ids: index representation of the data
    :param input_mask:
    :param segment_ids:
    :param labels: index representation of the labels
    :param num_labels: number of classes
    :param use_one_hot_embeddings:
    :return: the result of ``add_blstm_crf_layer()``
    """
    # Load the data into BertModel to obtain the token embeddings.
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    # Sequence output: [batch_size, seq_length, embedding_size]
    embedding = bert.get_sequence_output()
    max_seq_length = embedding.shape[1].value

    # Vector of size [batch_size] holding the length of every sequence in the
    # current batch (count of non-zero ids).
    non_padding = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(non_padding, reduction_indices=1)

    tagger = BLSTM_CRF(embedded_chars=embedding,
                       hidden_unit=FLAGS.lstm_size,
                       cell_type=FLAGS.cell,
                       num_layers=FLAGS.num_layers,
                       droupout_rate=FLAGS.droupout_rate,
                       initializers=initializers,
                       num_labels=num_labels,
                       seq_length=max_seq_length,
                       labels=labels,
                       lengths=lengths,
                       is_training=is_training)
    return tagger.add_blstm_crf_layer()
def f_epsilon(self, x):
    """Return the signed square root of ``x``: ``sign(x) * sqrt(|x|)``."""
    direction = tf.sign(x)
    root = tf.sqrt(tf.abs(x))
    return direction * root
def train():
    """Train a linear soft-margin SVM on 'linearly_separable_data.csv'.

    A quarter of the data is held out for testing. The model minimises
    ``0.5 * ||W||^2 + svmC * hinge`` with Adam, and periodically prints the
    loss and accuracy on the held-out split.

    NOTE(review): the original indentation was lost; the epoch check and the
    evaluation check are assumed to be sibling ``if`` statements - confirm.
    """
    # Load the data and split it into training and testing sets.
    data, labels = load_data.extract_data('linearly_separable_data.csv')
    X_train, X_test, Y_train, Y_test = train_test_split(data, labels, test_size=0.25)

    train_data_node = tf.placeholder(tf.float32, shape=(None, 2))
    train_label_node = tf.placeholder(tf.float32, shape=(None, 1))

    # Model parameters.
    W = tf.Variable(tf.random_uniform([2, 1], -1.0, 1.0), name="W")
    b = tf.Variable(tf.zeros([1]))

    # y_value = [batch_size, 1]
    y_value = tf.matmul(train_data_node, W) + b

    weight_loss = 0.5 * tf.reduce_sum(tf.square(W))
    hinge_loss = tf.reduce_sum(
        tf.maximum(tf.zeros([BATCH_SIZE, 1]), 1 - train_label_node * y_value))
    svm_loss = weight_loss + svmC * hinge_loss

    # Same loss, sized for the test split.
    hinge_loss_test = tf.reduce_sum(
        tf.maximum(tf.zeros([Test_Size, 1]), 1 - train_label_node * y_value))
    svm_loss_test = weight_loss + svmC * hinge_loss_test

    # Training op.
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(svm_loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Evaluation ops.
    predicted_class = tf.sign(y_value)
    correct_prediction = tf.equal(train_label_node, predicted_class)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        print('Initialized!')

        batches = load_data.batch_iter(
            list(zip(X_train, Y_train)), BATCH_SIZE, NUM_EPOCHS)
        epoch = 1
        print("Epoch " + str(epoch) + ":")
        for batch_count, batch in enumerate(batches, 1):
            # One training step.
            x_batch, y_batch = zip(*batch)
            feed_dict = {train_data_node: x_batch, train_label_node: y_batch}
            _, step, losses = sess.run(
                [train_op, global_step, svm_loss], feed_dict=feed_dict)

            # Epoch boundary.
            if (batch_count * BATCH_SIZE) % Train_Size == 0:
                epoch += 1
                print("Epoch " + str(epoch) + ":")

            # Periodic evaluation on the held-out split.
            if batch_count % EVAL_FREQUENCY == 0:
                feed_dict = {train_data_node: X_test, train_label_node: Y_test}
                step, losses, acc = sess.run(
                    [global_step, svm_loss_test, accuracy], feed_dict=feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, losses, acc))
common math functions ''' tf.abs() tf.ceil() tf.cos() tf.exp() tf.floor() tf.inv() tf.log() tf.maximum() tf.minimum() #tf.neg() tf.pow() tf.round() tf.rsqrt() tf.sign() tf.sin() tf.sqrt() tf.square() ''' special math functions ''' tf.digamma() tf.erf() tf.erfc() tf.igamma() tf.igammac() tf.lbeta() tf.lgamma() tf.squared_difference() '''
# Declare vector L2 'norm' function squared l2_norm = tf.reduce_sum(tf.square(A)) # Declare loss function # Loss = max(0, 1-pred*actual) + alpha * L2_norm(A)^2 # L2 regularization parameter, alpha alpha = tf.constant([0.01]) # Margin term in loss classification_term = tf.reduce_mean( tf.maximum(0., tf.subtract(1., tf.multiply(model_output, y_target)))) # Put terms together loss = tf.add(classification_term, tf.multiply(alpha, l2_norm)) # Declare prediction function prediction = tf.sign(model_output) accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, y_target), tf.float32)) # Declare optimizer my_opt = tf.train.GradientDescentOptimizer(0.01) train_step = my_opt.minimize(loss) # Initialize variables init = tf.global_variables_initializer() sess.run(init) # Training loop loss_vec = [] train_accuracy = [] test_accuracy = [] for i in range(500):
def __init__(self):
    """Build the BERT + BiLSTM-CRF graph, its placeholders and the train op.

    Downloads/locates the pretrained BERT archive, creates the input
    placeholders, instantiates ``modeling.BertModel``, initialises its
    variables from the checkpoint, feeds the per-token BERT output to
    ``BiLstmCrf`` and exposes ``self.loss``, ``self.pred`` and
    ``self.train_op``.
    """
    path = remote_helper.get_remote_date(
        "https://www.flyai.com/m/uncased_L-24_H-1024_A-16.zip")
    # Files are looked up in the directory named like the archive minus its extension.
    data_root = os.path.splitext(path)[0]
    bert_config_file = os.path.join(data_root, 'bert_config.json')
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    init_checkpoint = os.path.join(data_root, 'bert_model.ckpt')
    # NOTE(review): bert_vocab_file is assigned but not used in this method.
    bert_vocab_file = os.path.join(data_root, 'vocab.txt')
    self.input_ids = tf.placeholder(tf.int32, shape=[None, None], name='input_ids')
    self.input_mask = tf.placeholder(tf.int32, shape=[None, None], name='input_masks')
    self.segment_ids = tf.placeholder(tf.int32, shape=[None, None], name='segment_ids')
    self.labels = tf.placeholder(tf.int32, shape=[
        None,
    ], name="labels")
    self.is_training = tf.placeholder_with_default(False, shape=(), name='is_training')
    # NOTE(review): this placeholder is not what the optimizer below receives;
    # the optimizer reads model_config.learning_rate directly -- confirm intent.
    self.learning_rate = tf.placeholder_with_default(config.learning_rate, shape=(), name='learning_rate')
    # Create the BERT model
    with tf.name_scope('Bert'):
        model = modeling.BertModel(
            config=bert_config,
            # NOTE(review): hard-coded to True; self.is_training is only
            # passed to BiLstmCrf further down.
            is_training=True,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            # Set to True for faster execution on TPU; set to False for
            # faster execution on CPU or GPU.
            use_one_hot_embeddings=False)
        # This returns the output for every token, shaped
        # [batch_size, seq_length, embedding_size]; use it for seq2seq or NER.
        # output_layer = model.get_sequence_output()
        tvars = tf.trainable_variables()
        # Load the pretrained BERT weights
        (assignment_map, initialized_variable_names) = \
            modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        # output_layer = model.get_pooled_output()  # this returns the sentence-level output
        # hidden_size = output_layer.shape[-1].value  # get the output dimension
    embedding = model.get_sequence_output()
    max_seq_length = embedding.shape[1].value
    # 1 where the token id is non-zero, 0 otherwise
    # (presumably id 0 is padding -- TODO confirm against the vocab).
    used = tf.sign(tf.abs(self.input_ids))
    lengths = tf.reduce_sum(
        used, reduction_indices=1)  # vector of size [batch_size] holding the sequence lengths of the current batch
    blstm_crf = BiLstmCrf(embedded_chars=embedding,
                          max_seq_length=max_seq_length,
                          labels=self.labels,
                          lengths=lengths,
                          is_training=self.is_training)
    self.loss, logits, trans, pred_ids = blstm_crf.add_blstm_crf_layer()
    with tf.variable_scope("predict"):
        # pred_ids is used as the initial value of a tf.Variable named 'pred'.
        self.pred = tf.Variable(pred_ids, name='pred')
    with tf.name_scope("train_op"):
        self.train_op = tf.train.AdamOptimizer(
            learning_rate=model_config.learning_rate).minimize(self.loss)
def accuracy(labels, predictions, weights):
    """Weighted accuracy of sign-thresholded predictions.

    Predictions are binarised as ``relu(sign(predictions))`` (1 for strictly
    positive values, 0 otherwise) and handed to ``tf.metrics.accuracy``
    together with ``labels`` and ``weights``.

    Returns:
        Whatever ``tf.metrics.accuracy`` returns for those arguments.
    """
    signed = tf.sign(predictions)
    binarised = tf.nn.relu(signed)
    return tf.metrics.accuracy(labels, binarised, weights)
def __init__(self, config):
    """Build the tagging graph: inputs, encoder, attention, decoder, loss, optimizer.

    Args:
        config: Mapping read for the keys ``lr``, ``char_dim``, ``lstm_dim``,
            ``seg_dim``, ``subtype_dim``, ``num_tags``, ``num_char``,
            ``num_steps``, ``optimizer`` (one of ``"sgd"``, ``"adam"``,
            ``"adgrad"``) and ``clip`` (gradient clipping bound). It is also
            forwarded to the embedding layers.

    Raises:
        KeyError: If ``config["optimizer"]`` is not a supported name.
    """
    print(config)
    self.config = config
    self.lr = config["lr"]
    self.char_dim = config["char_dim"]
    self.lstm_dim = config["lstm_dim"]
    self.seg_dim = config["seg_dim"]
    self.subtype_dim = config["subtype_dim"]
    self.num_tags = config["num_tags"]
    self.num_chars = config["num_char"]
    self.num_steps = config["num_steps"]
    self.num_segs = 14
    self.num_subtypes = 51
    self.seq_nums = 8

    self.global_step = tf.Variable(0, trainable=False)
    self.best_dev_f1 = tf.Variable(0.0, trainable=False)
    self.best_test_f1 = tf.Variable(0.0, trainable=False)
    self.initializer = initializers.xavier_initializer()

    # Model inputs
    self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="ChatInputs")
    self.seg_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="SegInputs")
    self.subtype_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="SubInputs")
    self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="Targets")
    self.doc_inputs = tf.placeholder(dtype=tf.int32,
                                     shape=[None, None, self.num_steps],
                                     name="doc_inputs")
    # Passed as the second positional argument of tf.nn.dropout below.
    self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")
    self.char_lookup = tf.get_variable(
        name="char_embedding",
        shape=[self.num_chars, self.char_dim],
        initializer=self.initializer)

    # Sequence length = number of non-zero char ids per row
    # (presumably id 0 is padding -- TODO confirm).
    used = tf.sign(tf.abs(self.char_inputs))
    length = tf.reduce_sum(used, reduction_indices=1)
    self.lengths = tf.cast(length, tf.int32)
    self.batch_size = tf.shape(self.char_inputs)[0]

    # Encoder
    embedding = self.embedding_layer(self.char_inputs, self.seg_inputs,
                                     self.subtype_inputs, config)
    doc_embedding = self.doc_embedding_layer(self.doc_inputs, self.lstm_dim,
                                             self.lengths, config)
    lstm_inputs = tf.nn.dropout(embedding, self.dropout)
    lstm_outputs, lstm_states = self.biLSTM_layer(lstm_inputs, self.lstm_dim,
                                                  self.lengths)
    lstm_outputs = tf.nn.dropout(lstm_outputs, self.dropout)

    # Sentence-level and document-level attention, merged by a gate
    sen_att_outputs = self.attention(lstm_outputs)
    doc_att_outputs = self.doc_attention(doc_embedding, lstm_states)
    gat_output = self.gate(sen_att_outputs, doc_att_outputs)

    # Decoder and projection to tag logits
    outputs = tf.concat([embedding, gat_output], -1)
    lstm_outputs = self.LSTM_decoder(outputs, self.lstm_dim)
    # lstm_outputs = self.tag_attention(lstm_outputs)
    self.logits = self.project_layer(lstm_outputs)
    self.loss = self.loss_layer(self.logits, self.lengths)

    with tf.variable_scope("optimizer"):
        optimizer = self.config["optimizer"]
        if optimizer == "sgd":
            self.opt = tf.train.GradientDescentOptimizer(self.lr)
        elif optimizer == "adam":
            self.opt = tf.train.AdamOptimizer(self.lr)
        elif optimizer == "adgrad":
            self.opt = tf.train.AdagradOptimizer(self.lr)
        else:
            # Same exception type as before, now naming the offending value.
            raise KeyError(optimizer)

        # Clip gradients element-wise to [-clip, clip]. compute_gradients
        # yields None for variables the loss does not depend on; those pairs
        # are skipped because tf.clip_by_value cannot handle None.
        clip = self.config["clip"]
        grads_vars = self.opt.compute_gradients(self.loss)
        capped_grads_vars = [[tf.clip_by_value(g, -clip, clip), v]
                             for g, v in grads_vars if g is not None]
        self.train_op = self.opt.apply_gradients(capped_grads_vars,
                                                 self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
# Load MNIST and scale the pixel values by 1/255.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Small fully connected classifier; the last layer already applies softmax.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=5)
model.evaluate(x_test, y_test, verbose=2)

# The model outputs probabilities (softmax), so the loss must not treat them
# as logits. Keras losses are called as loss(y_true, y_pred).
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
labels = tf.one_hot(y_test, 10)
loss_fn(labels, model(x_test))

# Fast gradient sign attack: move every pixel by 0.05 in the direction that
# increases the loss, then re-evaluate on the perturbed images.
x = tf.convert_to_tensor(x_test)
with tf.GradientTape() as tape:
    tape.watch(x)
    prediction = model(x)
    # Bound to `loss` as before, but no longer overwriting the loss object.
    loss = loss_fn(labels, prediction)
grad = tape.gradient(loss, x)
adv_x = x + 0.05 * tf.sign(grad)
model.evaluate(adv_x, y_test, verbose=2)
def train(discriminator, data, test_data, config):
    """Adversarially train ``discriminator`` on PGD-perturbed inputs.

    Each training input is perturbed with ``pgd_iter`` signed-gradient steps
    inside an ``epsilon`` ball, the classifier is trained on the perturbed
    batch with momentum SGD plus L2 weight decay, and clean / FGS / PGD test
    accuracy is reported periodically. The trainable ``discriminator``
    variables are saved to ``config['model_file']``.

    Args:
        discriminator: Callable mapping an input tensor to class logits.
        data: Indexable whose item 0 is the input tensor and item 1 the
            integer label tensor.
        test_data: Forwarded to ``build_test``, ``build_test_fgs`` and
            ``build_test_pgd``.
        config: Mapping read for ``random_seed``, ``batch_size``, ``epsilon``,
            ``class_num``, ``pgd_iter``, ``learning_rate``, ``weight_decay``,
            ``train_size``, ``nsteps`` and ``model_file``.
    """
    tf.set_random_seed(int(config['random_seed']))
    batch_size = config['batch_size']
    epsilon = config['epsilon']  # perturbation error
    class_num = config['class_num']  # number of output classes
    pgd_iter = config['pgd_iter']
    learning_rate = config['learning_rate']
    weight_decay = config['weight_decay']

    x_real = data[0]
    label = data[1]
    # Normalize to range [-1,1] (assumes inputs arrive in [0, 1] -- TODO confirm)
    x_real = 2. * x_real - 1.

    # PGD: random start inside the epsilon ball, then signed-gradient ascent
    # steps, projecting back onto the ball and the valid range each time.
    step_size = epsilon * 0.25
    x = x_real + tf.random_uniform(x_real.shape, -epsilon, epsilon)
    for _ in range(pgd_iter):
        d_out = discriminator(x)
        d_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=d_out, labels=tf.one_hot(label, class_num)))
        grad_x, = tf.gradients(d_loss, x)
        x = tf.stop_gradient(x + step_size * tf.sign(grad_x))
        x = tf.clip_by_value(x, x_real - epsilon, x_real + epsilon)
        x = tf.clip_by_value(x, -1.0, 1.0)

    # Training loss on the adversarial examples
    d_out_adv = discriminator(x)
    d_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=d_out_adv, labels=tf.one_hot(label, class_num)))

    d_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                               scope='discriminator')

    # Weight decay: assume weights are named 'kernel' or 'weights'. Names in
    # d_vars start with the 'discriminator' scope, so a substring test is
    # equivalent to the previous find(...) > 0 check.
    d_decay = weight_decay * 0.5 * sum(
        tf.reduce_sum(tf.square(v)) for v in d_vars
        if 'kernel' in v.name or 'weights' in v.name)

    # SGD with momentum
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=0.9)

    d_grads = tf.gradients(d_loss + d_decay, d_vars)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(zip(d_grads, d_vars))

    # Gradient norm to evaluate convergence
    d_reg = 0.5 * sum(tf.reduce_sum(tf.square(g)) for g in d_grads)

    # build test
    acc, acc_update, acc_init = build_test(discriminator, test_data, config)
    acc_fgs, acc_update_fgs, acc_init_fgs = build_test_fgs(
        discriminator, test_data, config)
    acc_pgd, acc_update_pgd, acc_init_pgd = build_test_pgd(
        discriminator, test_data, config)

    saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                          scope='discriminator'))
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            train_size = config['train_size']
            num_steps_per_epoch = int(train_size / batch_size) + 1
            for batch_idx in range(config['nsteps']):
                d_loss_out, d_reg_out, _ = sess.run([d_loss, d_reg, train_op])
                # Evaluate roughly once per epoch
                if batch_idx % num_steps_per_epoch == 0:
                    test_acc = run_test(acc, acc_update, acc_init, sess, config)
                    test_acc_fgs = run_test(acc_fgs, acc_update_fgs,
                                            acc_init_fgs, sess, config)
                    test_acc_pgd = run_test(acc_pgd, acc_update_pgd,
                                            acc_init_pgd, sess, config)
                    print(
                        'i=%d, Loss_d: %4.4f, test_acc: %.4f, fgs_acc: %.4f pgd_acc: %.4f d_reg: %.4f'
                        % (batch_idx, d_loss_out, test_acc, test_acc_fgs,
                           test_acc_pgd, d_reg_out))

            # NOTE(review): checkpoint is written once, after the training
            # loop -- confirm this matches the intended save frequency.
            model_filename = config['model_file']
            saver.save(sess, model_filename)
        finally:
            # Stop and join the input queue threads so they do not outlive
            # the session, including when training raises.
            coord.request_stop()
            coord.join(threads)
def bernoulli(self, p):
    """Draw a 0/1 sample for every entry of ``p``.

    An entry becomes 1 when ``p`` exceeds a fresh uniform random number from
    ``tf.random_uniform`` and 0 otherwise (``relu(sign(p - u))``).

    Args:
        p: Tensor of probabilities.

    Returns:
        Tensor with the same shape as ``p`` holding 0.0 / 1.0 values.
    """
    # Use the dynamic shape: p.shape cannot be passed to random_uniform when
    # a dimension (typically the batch size) is unknown at graph build time.
    return tf.nn.relu(tf.sign(p - tf.random_uniform(tf.shape(p))))
def __length(sequence):
    """Count the non-zero steps along axis 1 of ``sequence``.

    A step counts as used when the largest absolute value along axis 2 is
    non-zero; the per-row counts are returned as ``tf.int32``.
    """
    step_peak = tf.reduce_max(tf.abs(sequence), 2)
    step_used = tf.sign(step_peak)
    return tf.cast(tf.reduce_sum(step_used, 1), tf.int32)
def Transformer_match(context,
                      query,
                      context_mask,
                      query_mask,
                      num_units=None,
                      num_heads=1,
                      dropout_keep_rate=1.0,
                      causality=False,
                      scope='MultiHead_Attention_Block',
                      reuse=None,
                      residual=False,
                      normalize_output=False,
                      **kwargs):
    """Multi-head attention of ``context`` (queries) over ``query`` (keys/values).

    Args:
      context: A 3d tensor, annotated as [N, T_q, C_q]; projected to Q.
      query: A 3d tensor, annotated as [N, T_k, C_k]; projected to K and V.
      context_mask: Mask annotated as (N, T_q); when None it is derived from
        the non-zero rows of `context`.
      query_mask: Mask annotated as (N, T_k); when None it is derived from
        the non-zero rows of `query`.
      num_units: A scalar. Attention size. Replaced by the last dimension of
        `context` when None or when `residual` is set.
      num_heads: An int. Number of heads.
      dropout_keep_rate: Keep probability handed to `tf.nn.dropout`.
      causality: Boolean. If true, units that reference the future are masked.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
      residual: Boolean. If true, `context` is added to the result.
      normalize_output: Boolean. If true, `layer_norm` is applied last.
      **kwargs: Accepted and ignored.

    Returns
      A 3d tensor annotated as (N, T_q, C)
    """
    if num_units is None or residual:
        num_units = context.get_shape().as_list()[-1]

    # Value written over masked-out attention scores before the softmax.
    mask_fill = -2**32 + 1

    def split_heads(tensor):
        # (N, T, C) -> (h*N, T, C/h)
        return tf.concat(tf.split(tensor, num_heads, axis=2), axis=0)

    with tf.variable_scope(scope, reuse=reuse):
        # Linear projections
        Q = tf.layers.dense(context, num_units, activation=tf.nn.relu)  # (N, T_q, C)
        K = tf.layers.dense(query, num_units, activation=tf.nn.relu)  # (N, T_k, C)
        V = tf.layers.dense(query, num_units, activation=tf.nn.relu)  # (N, T_k, C)

        # One slice of the channel axis per head, stacked on the batch axis
        Q_ = split_heads(Q)  # (h*N, T_q, C/h)
        K_ = split_heads(K)  # (h*N, T_k, C/h)
        V_ = split_heads(V)  # (h*N, T_k, C/h)

        # Scaled dot-product scores
        scores = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
        head_dim = K_.get_shape().as_list()[-1]
        scores = scores / (head_dim**0.5)

        # Key masking (the `query` side)
        if query_mask is None:
            query_mask = tf.sign(tf.abs(tf.reduce_sum(query, axis=-1)))  # (N, T_k)
        key_mask = tf.tile(query_mask, [num_heads, 1])  # (h*N, T_k)
        key_mask = tf.tile(tf.expand_dims(key_mask, 1),
                           [1, tf.shape(context)[1], 1])  # (h*N, T_q, T_k)
        fill = tf.ones_like(scores) * mask_fill
        scores = tf.where(tf.equal(key_mask, 0), fill, scores)  # (h*N, T_q, T_k)

        # Future blinding
        if causality:
            ones = tf.ones_like(scores[0, :, :])  # (T_q, T_k)
            lower = tf.contrib.linalg.LinearOperatorLowerTriangular(
                ones).to_dense()  # (T_q, T_k)
            causal_mask = tf.tile(tf.expand_dims(lower, 0),
                                  [tf.shape(scores)[0], 1, 1])  # (h*N, T_q, T_k)
            fill = tf.ones_like(causal_mask) * mask_fill
            scores = tf.where(tf.equal(causal_mask, 0), fill, scores)  # (h*N, T_q, T_k)

        # Attention weights
        weights = tf.nn.softmax(scores)  # (h*N, T_q, T_k)

        # Query masking (the `context` side)
        if context_mask is None:
            context_mask = tf.sign(tf.abs(tf.reduce_sum(context, axis=-1)))  # (N, T_q)
        row_mask = tf.tile(context_mask, [num_heads, 1])  # (h*N, T_q)
        row_mask = tf.tile(tf.expand_dims(row_mask, -1),
                           [1, 1, tf.shape(query)[1]])  # (h*N, T_q, T_k)
        weights = weights * row_mask  # (h*N, T_q, T_k)

        # Dropout on the attention weights
        weights = tf.nn.dropout(weights, keep_prob=dropout_keep_rate)

        # Weighted sum of the values
        attended = tf.matmul(weights, V_)  # (h*N, T_q, C/h)

        # Move the heads back from the batch axis to the channel axis
        attended = tf.concat(tf.split(attended, num_heads, axis=0), axis=2)  # (N, T_q, C)

        if residual:
            # Residual connection
            attended = attended + context
        if normalize_output:
            # Normalize
            attended = layer_norm(attended)  # (N, T_q, C)

    return attended
def _length(self):
    """Return the step mask and per-row step count of ``self._x``.

    A step is marked 1 in the mask when the largest absolute value along
    axis 2 is non-zero, else 0. The length is the sum of the mask along
    axis 1, cast to ``tf.int32``.

    Returns:
        Tuple ``(mask, length)``.
    """
    step_peak = tf.reduce_max(tf.abs(self._x), 2)
    mask = tf.sign(step_peak)
    length = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
    return mask, length
def sample_prob(self, probs):
    """Randomly sample 0/1 values from ``probs``.

    An entry is 1 when its probability exceeds a fresh uniform random number
    from ``tf.random_uniform`` and 0 otherwise.
    """
    # Random sampling: compare against uniform noise of the same dynamic shape.
    noise = tf.random_uniform(tf.shape(probs))
    signed = tf.sign(probs - noise)
    return tf.nn.relu(signed)
def _length(seq):
    """Count the non-zero entries of ``seq`` along axis 1, as ``tf.int32``."""
    nonzero = tf.sign(tf.abs(seq))
    total = tf.reduce_sum(nonzero, reduction_indices=1)
    return tf.cast(total, tf.int32)
#Check to see if we finished adding in the amount of users for training if amountOfUsedUsers == 0: break amountOfUsedUsers -= 1 hiddenUnits = 20 visibleUnits = len(movies_df) vb = tf.placeholder("float", [visibleUnits]) #Number of unique movies hb = tf.placeholder("float", [hiddenUnits]) #Number of features we're going to learn W = tf.placeholder("float", [visibleUnits, hiddenUnits]) #Phase 1: Input Processing v0 = tf.placeholder("float", [None, visibleUnits]) _h0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb) h0 = tf.nn.relu(tf.sign(_h0 - tf.random_uniform(tf.shape(_h0)))) #Phase 2: Reconstruction _v1 = tf.nn.sigmoid(tf.matmul(h0, tf.transpose(W)) + vb) v1 = tf.nn.relu(tf.sign(_v1 - tf.random_uniform(tf.shape(_v1)))) h1 = tf.nn.sigmoid(tf.matmul(v1, W) + hb) #Learning rate alpha = 1.0 #Create the gradients w_pos_grad = tf.matmul(tf.transpose(v0), h0) w_neg_grad = tf.matmul(tf.transpose(v1), h1) #Calculate the Contrastive Divergence to maximize CD = (w_pos_grad - w_neg_grad) / tf.to_float(tf.shape(v0)[0]) #Create methods to update the weights and biases update_w = W + alpha * CD update_vb = vb + alpha * tf.reduce_mean(v0 - v1, 0)
def multihead_attention(queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    '''Applies multihead attention.

    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size. Defaults to the last dimension of
        `queries` when None.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked.
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns
      A tuple `(outputs, alignments)`: `outputs` is a 3d tensor with shape of
      (N, T_q, C); `alignments` is the masked attention weights transposed to
      (h*N, T_k, T_q), taken before dropout.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units.
        # as_list is a method: it must be called before indexing, otherwise
        # this branch raises TypeError.
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)

        # Key Masking: positions whose key vector sums to zero get a large
        # negative score so the softmax ignores them.
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            # NOTE(review): LinearOperatorTriL is the older contrib name for
            # the lower-triangular operator -- confirm it exists in the
            # TensorFlow version this file targets.
            tril = tf.contrib.linalg.LinearOperatorTriL(
                diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking: zero the attention rows of all-zero query positions.
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # (h*N, T_q, T_k)

        # Alignments
        alignments = tf.transpose(outputs, [0, 2, 1])  # (h*N, T_k, T_q)

        # Dropouts
        outputs = tf.layers.dropout(outputs,
                                    rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # ( h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalize
        outputs = normalize(outputs)  # (N, T_q, C)

    return outputs, alignments