def get_linear_classifier(self, layer, token_weights, last_output=None, variable_scope=None, reuse=False):
  """"""
  
  if last_output is not None:
    n_layers = 0
    layer = last_output['hidden_layer']
    recur_layer = last_output['recur_layer']
  else:
    n_layers = self.n_layers
    recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, n_layers):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, self.hidden_size,
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('Classifier'):
      logits = classifiers.linear_classifier(layer, len(self),
                                             hidden_keep_prob=hidden_keep_prob)
  targets = self.placeholder
  
  #-----------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m x c) -> (n x m x c)
  probabilities = tf.nn.softmax(logits)
  # (n x m), (n x m x c), (n x m) -> ()
  loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, weights=token_weights)
  
  #-----------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x c) -> (n x m)
  predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
  # (n x m) (*) (n x m) -> (n x m)
  correct_tokens = nn.equal(targets, predictions) * token_weights
  # (n x m) -> (n)
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  # (n x m) -> (n)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  # (n), (n) -> (n)
  correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['hidden_layer'] = layer
  outputs['targets'] = targets
  outputs['probabilities'] = probabilities
  outputs['loss'] = loss
  outputs['predictions'] = predictions
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
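#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the helper below
# reproduces the token-weighted loss/accuracy bookkeeping of
# get_linear_classifier with plain TensorFlow 1.x ops, assuming `logits`
# (n x m x c), integer `targets` (n x m), and 0/1 integer `token_weights`
# (n x m) are already built. The function name and arguments are
# hypothetical; it relies on this module's `tensorflow as tf` import.
def _sketch_token_softmax_metrics(logits, targets, token_weights):
  # (n x m x c) -> (n x m x c)
  probabilities = tf.nn.softmax(logits)
  # token-weighted mean cross entropy -> ()
  loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, weights=token_weights)
  # (n x m x c) -> (n x m)
  predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
  # a token only counts as correct where it is not padding
  correct_tokens = tf.cast(tf.equal(targets, predictions), tf.int32) * token_weights
  # a sequence is correct iff every one of its non-pad tokens is correct
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  correct_sequences = tf.cast(tf.equal(tokens_per_sequence, correct_tokens_per_sequence), tf.int32)
  return probabilities, loss, tf.reduce_sum(correct_tokens), tf.reduce_sum(correct_sequences)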
def get_bilinear_classifier(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_func = self.hidden_func
  hidden_size = self.hidden_size
  add_linear = self.add_linear
  linearize = self.linearize
  distance = self.distance
  n_splits = 2 * (1 + linearize + distance)
  with tf.variable_scope(variable_scope or self.field):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, n_splits * hidden_size,
                                   hidden_func=hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, n_splits * [hidden_size],
                                   hidden_func=hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    if linearize:
      lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
    if distance:
      dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)
    
    with tf.variable_scope('Attention'):
      if self.diagonal:
        logits, _ = classifiers.diagonal_bilinear_attention(
          layer1, layer2,
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
        if linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.diagonal_bilinear_discriminator(
              lin_layer1, lin_layer2,
              hidden_keep_prob=hidden_keep_prob,
              add_linear=add_linear)
        if distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1 + tf.nn.softplus(
              classifiers.diagonal_bilinear_discriminator(
                dist_layer1, dist_layer2,
                hidden_keep_prob=hidden_keep_prob,
                add_linear=add_linear))
      else:
        logits, _ = classifiers.bilinear_attention(
          layer1, layer2,
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
        if linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.bilinear_discriminator(
              lin_layer1, lin_layer2,
              hidden_keep_prob=hidden_keep_prob,
              add_linear=add_linear)
        if distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1 + tf.nn.softplus(
              classifiers.bilinear_discriminator(
                dist_layer1, dist_layer2,
                hidden_keep_prob=hidden_keep_prob,
                add_linear=add_linear))
  
  #-----------------------------------------------------------
  # Process the targets
  targets = self.placeholder
  shape = tf.shape(layer1)
  batch_size, bucket_size = shape[0], shape[1]
  # (1 x m)
  ids = tf.expand_dims(tf.range(bucket_size), 0)
  # (1 x m) -> (1 x 1 x m)
  head_ids = tf.expand_dims(ids, -2)
  # (1 x m) -> (1 x m x 1)
  dep_ids = tf.expand_dims(ids, -1)
  if linearize:
    # Wherever the head is to the left
    # (n x m), (1 x m) -> (n x m)
    lin_targets = tf.to_float(tf.less(targets, ids))
    # cross-entropy of the linearization of each i,j pair
    # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
    lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
    # (n x 1 x m), (n x m x 1) -> (n x m x m)
    lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
    # add the cross-entropy to the logits
    # (n x m x m), (n x m x m) -> (n x m x m)
    logits += tf.stop_gradient(lin_xent)
  if distance:
    # (n x m) - (1 x m) -> (n x m)
    dist_targets = tf.abs(targets - ids)
    # KL-divergence of the distance of each i,j pair
    # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
    dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1])) + 1e-12
    # (n x m x m), (n x m x m) -> (n x m x m)
    #dist_kld = (dist_ids * tf.log(dist_lamda / dist_ids) + dist_ids - dist_lamda)
    dist_kld = -tf.log((dist_ids - dist_lamda)**2 / 2 + 1)
    # add the KL-divergence to the logits
    # (n x m x m), (n x m x m) -> (n x m x m)
    logits += tf.stop_gradient(dist_kld)
  
  #-----------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m) + (m) -> (n x m)
  non_pads = tf.to_float(token_weights) + tf.to_float(tf.logical_not(tf.cast(tf.range(bucket_size), dtype=tf.bool)))
  # (n x m x m) o (n x 1 x m) -> (n x m x m)
  probabilities = tf.nn.softmax(logits) * tf.expand_dims(non_pads, -2)
  # (n x m), (n x m x m), (n x m) -> ()
  loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, weights=token_weights)
  # (n x m) -> (n x m x m x 1)
  one_hot_targets = tf.expand_dims(tf.one_hot(targets, bucket_size), -1)
  # (n x m) -> ()
  n_tokens = tf.to_float(tf.reduce_sum(token_weights))
  if linearize:
    # (n x m x m) -> (n x m x 1 x m)
    lin_xent_reshaped = tf.expand_dims(lin_xent, -2)
    # (n x m x 1 x m) * (n x m x m x 1) -> (n x m x 1 x 1)
    lin_target_xent = tf.matmul(lin_xent_reshaped, one_hot_targets)
    # (n x m x 1 x 1) -> (n x m)
    lin_target_xent = tf.squeeze(lin_target_xent, [-1, -2])
    # (n x m), (n x m), (n x m) -> ()
    loss -= tf.reduce_sum(lin_target_xent * tf.to_float(token_weights)) / (n_tokens + 1e-12)
  if distance:
    # (n x m x m) -> (n x m x 1 x m)
    dist_kld_reshaped = tf.expand_dims(dist_kld, -2)
    # (n x m x 1 x m) * (n x m x m x 1) -> (n x m x 1 x 1)
    dist_target_kld = tf.matmul(dist_kld_reshaped, one_hot_targets)
    # (n x m x 1 x 1) -> (n x m)
    dist_target_kld = tf.squeeze(dist_target_kld, [-1, -2])
    # (n x m), (n x m), (n x m) -> ()
    loss -= tf.reduce_sum(dist_target_kld * tf.to_float(token_weights)) / (n_tokens + 1e-12)
  
  #-----------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x m) -> (n x m)
  predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
  # (n x m) (*) (n x m) -> (n x m)
  correct_tokens = nn.equal(targets, predictions) * token_weights
  # (n x m) -> (n)
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  # (n x m) -> (n)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  # (n), (n) -> (n)
  correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = self.placeholder
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = predictions
  outputs['predictions'] = predictions
  outputs['correct_unlabeled_tokens'] = correct_tokens
  outputs['n_correct_unlabeled_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_unlabeled_sequences'] = tf.reduce_sum(correct_sequences)
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
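#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the linearization
# term used above is the log-probability of the observed head direction
# under a sigmoid model; for a cell (dep i, head j) with j < i it is
# log(sigmoid(lin_logit)), otherwise log(sigmoid(-lin_logit)), using the
# identity -softplus(-x) == log(sigmoid(x)). The helper name and arguments
# are hypothetical, assuming `lin_logits` is (n x m x m) and `bucket_size`
# and `batch_size` are scalar int tensors.
def _sketch_linearization_log_prob(lin_logits, bucket_size, batch_size):
  ids = tf.expand_dims(tf.range(bucket_size), 0)
  # (1 x 1 x m) candidate head indices, (1 x m x 1) dependent indices
  head_ids = tf.expand_dims(ids, -2)
  dep_ids = tf.expand_dims(ids, -1)
  # True wherever the candidate head is to the left of the dependent
  lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
  # log-probability of the direction that actually holds for each pair
  return -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))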
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  n_splits = 2 * (1 + self.linearize + self.distance)
  with tf.variable_scope(variable_scope or self.field):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, n_splits * self.hidden_size,
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, n_splits * [self.hidden_size],
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    if self.linearize:
      lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
    if self.distance:
      dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)
    
    with tf.variable_scope('Discriminator'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_discriminator(
          layer1, layer2,
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
        if self.linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.diagonal_bilinear_discriminator(
              lin_layer1, lin_layer2,
              hidden_keep_prob=hidden_keep_prob,
              add_linear=add_linear)
        if self.distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1 + tf.nn.softplus(
              classifiers.diagonal_bilinear_discriminator(
                dist_layer1, dist_layer2,
                hidden_keep_prob=hidden_keep_prob,
                add_linear=add_linear))
      else:
        logits = classifiers.bilinear_discriminator(
          layer1, layer2,
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
        if self.linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.bilinear_discriminator(
              lin_layer1, lin_layer2,
              hidden_keep_prob=hidden_keep_prob,
              add_linear=add_linear)
        if self.distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1 + tf.nn.softplus(
              classifiers.bilinear_discriminator(
                dist_layer1, dist_layer2,
                hidden_keep_prob=hidden_keep_prob,
                add_linear=add_linear))
  
  #-----------------------------------------------------------
  # Process the targets
  # (n x m x m) -> (n x m x m)
  unlabeled_targets = self.placeholder
  shape = tf.shape(layer1)
  batch_size, bucket_size = shape[0], shape[1]
  # (1 x m)
  ids = tf.expand_dims(tf.range(bucket_size), 0)
  # (1 x m) -> (1 x 1 x m)
  head_ids = tf.expand_dims(ids, -2)
  # (1 x m) -> (1 x m x 1)
  dep_ids = tf.expand_dims(ids, -1)
  if self.linearize:
    # Wherever the head is to the left
    # (n x m x m), (1 x m x 1) -> (n x m x m)
    lin_targets = tf.to_float(tf.less(unlabeled_targets, dep_ids))
    # cross-entropy of the linearization of each i,j pair
    # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
    lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
    # (n x 1 x m), (n x m x 1) -> (n x m x m)
    lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
    # add the cross-entropy to the logits
    # (n x m x m), (n x m x m) -> (n x m x m)
    logits += tf.stop_gradient(lin_xent)
  if self.distance:
    # (n x m x m) - (1 x m x 1) -> (n x m x m)
    dist_targets = tf.abs(unlabeled_targets - dep_ids)
    # KL-divergence of the distance of each i,j pair
    # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
    dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1])) + 1e-12
    # (n x m x m), (n x m x m) -> (n x m x m)
    #dist_kld = (dist_ids * tf.log(dist_lamda / dist_ids) + dist_ids - dist_lamda)
    dist_kld = -tf.log((dist_ids - dist_lamda)**2 / 2 + 1)
    # add the KL-divergence to the logits
    # (n x m x m), (n x m x m) -> (n x m x m)
    logits += tf.stop_gradient(dist_kld)
  
  #-----------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m x m) -> (n x m x m)
  probabilities = tf.nn.sigmoid(logits) * tf.to_float(token_weights)
  # (n x m x m), (n x m x m), (n x m x m) -> ()
  loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)
  n_tokens = tf.to_float(tf.reduce_sum(token_weights))
  if self.linearize:
    lin_target_xent = lin_xent * unlabeled_targets
    loss -= tf.reduce_sum(lin_target_xent * tf.to_float(token_weights)) / (n_tokens + 1e-12)
  if self.distance:
    dist_target_kld = dist_kld * unlabeled_targets
    loss -= tf.reduce_sum(dist_target_kld * tf.to_float(token_weights)) / (n_tokens + 1e-12)
  
  #-----------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x m) -> (n x m x m)
  predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights
  # (n x m x m) (*) (n x m x m) -> (n x m x m)
  true_positives = predictions * unlabeled_targets
  # (n x m x m) -> ()
  n_predictions = tf.reduce_sum(predictions)
  n_targets = tf.reduce_sum(unlabeled_targets)
  n_true_positives = tf.reduce_sum(true_positives)
  # () - () -> ()
  n_false_positives = n_predictions - n_true_positives
  n_false_negatives = n_targets - n_true_positives
  # (n x m x m) -> (n)
  n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1, 2])
  n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1, 2])
  # (n) x 2 -> ()
  n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = predictions
  outputs['n_unlabeled_true_positives'] = n_true_positives
  outputs['n_unlabeled_false_positives'] = n_false_positives
  outputs['n_unlabeled_false_negatives'] = n_false_negatives
  outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
  outputs['predictions'] = predictions
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  return outputs
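#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the discriminator
# above scores every (dependent, head) cell independently and thresholds
# the logits at 0, so precision/recall statistics reduce to three sums.
# The helper name and arguments are hypothetical, assuming integer 0/1
# `unlabeled_targets` and `token_weights` of shape (n x m x m).
def _sketch_edge_prf_counts(logits, unlabeled_targets, token_weights):
  # predict an edge wherever the logit is positive and the cell is not padding
  predictions = tf.cast(tf.greater(logits, 0), tf.int32) * token_weights
  true_positives = predictions * unlabeled_targets
  n_predictions = tf.reduce_sum(predictions)
  n_targets = tf.reduce_sum(unlabeled_targets)
  n_true_positives = tf.reduce_sum(true_positives)
  # false positives and false negatives fall out by subtraction
  return n_true_positives, n_predictions - n_true_positives, n_targets - n_true_positives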
def get_unfactored_bilinear_classifier(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.field):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2 * self.hidden_size,
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2 * [self.hidden_size],
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    
    with tf.variable_scope('Classifier'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
      else:
        logits = classifiers.bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
  
  #-----------------------------------------------------------
  # Process the targets
  targets = self.placeholder
  # (n x m x m) -> (n x m x m)
  unlabeled_targets = nn.greater(targets, 0)
  
  #-----------------------------------------------------------
  # Process the logits
  # (n x m x c x m) -> (n x m x m x c)
  transposed_logits = tf.transpose(logits, [0, 1, 3, 2])
  
  #-----------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m x m x c) -> (n x m x m x c)
  probabilities = tf.nn.softmax(transposed_logits) * tf.to_float(tf.expand_dims(token_weights, axis=-1))
  # (n x m x m), (n x m x m x c), (n x m x m) -> ()
  loss = tf.losses.sparse_softmax_cross_entropy(targets, transposed_logits, weights=token_weights)
  
  #-----------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x m x c) -> (n x m x m)
  predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int32) * token_weights
  # (n x m x m) -> (n x m x m)
  unlabeled_predictions = nn.greater(predictions, 0)
  # (n x m x m) (*) (n x m x m) -> (n x m x m)
  unlabeled_true_positives = unlabeled_predictions * unlabeled_targets
  true_positives = nn.equal(targets, predictions) * unlabeled_true_positives
  # (n x m x m) -> ()
  n_predictions = tf.reduce_sum(unlabeled_predictions)
  n_targets = tf.reduce_sum(unlabeled_targets)
  n_unlabeled_true_positives = tf.reduce_sum(unlabeled_true_positives)
  n_true_positives = tf.reduce_sum(true_positives)
  # () - () -> ()
  n_unlabeled_false_positives = n_predictions - n_unlabeled_true_positives
  n_unlabeled_false_negatives = n_targets - n_unlabeled_true_positives
  n_false_positives = n_predictions - n_true_positives
  n_false_negatives = n_targets - n_true_positives
  # (n x m x m) -> (n)
  n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1, 2])
  n_unlabeled_true_positives_per_sequence = tf.reduce_sum(unlabeled_true_positives, axis=[1, 2])
  n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1, 2])
  # (n) x 2 -> ()
  n_correct_unlabeled_sequences = tf.reduce_sum(nn.equal(n_unlabeled_true_positives_per_sequence, n_targets_per_sequence))
  n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['label_targets'] = self.placeholder
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = tf.constant(0.)
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = unlabeled_predictions
  outputs['label_predictions'] = predictions
  outputs['n_unlabeled_true_positives'] = n_unlabeled_true_positives
  outputs['n_unlabeled_false_positives'] = n_unlabeled_false_positives
  outputs['n_unlabeled_false_negatives'] = n_unlabeled_false_negatives
  outputs['n_correct_unlabeled_sequences'] = n_correct_unlabeled_sequences
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  return outputs
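#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the unfactored
# classifier above scores every (dependent, head) cell over the label set
# at once; transposing (n x m x c x m) to (n x m x m x c) puts labels on
# the last axis so a per-cell sparse softmax applies directly. The helper
# name and arguments are hypothetical.
def _sketch_per_arc_label_loss(logits, label_targets, token_weights):
  # (n x m x c x m) -> (n x m x m x c): label scores for each (dep, head) cell
  transposed_logits = tf.transpose(logits, [0, 1, 3, 2])
  probabilities = tf.nn.softmax(transposed_logits)
  # non-edge / padded cells are masked out via the weights
  loss = tf.losses.sparse_softmax_cross_entropy(label_targets, transposed_logits, weights=token_weights)
  predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int32) * token_weights
  return probabilities, loss, predictions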
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2 * self.hidden_size,
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2 * [self.hidden_size],
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    
    with tf.variable_scope('Discriminator'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_discriminator(
          layer1, layer2,
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
      else:
        logits = classifiers.bilinear_discriminator(
          layer1, layer2,
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
  
  #-----------------------------------------------------------
  # Process the targets
  # (n x m x m) -> (n x m x m)
  unlabeled_targets = nn.greater(self.placeholder, 0)
  
  #-----------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m x m) -> (n x m x m)
  probabilities = tf.nn.sigmoid(logits)
  # (n x m x m), (n x m x m), (n x m x m) -> ()
  loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)
  
  #-----------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x m) -> (n x m x m)
  predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights
  # (n x m x m) (*) (n x m x m) -> (n x m x m)
  true_positives = predictions * unlabeled_targets
  # (n x m x m) -> ()
  n_predictions = tf.reduce_sum(predictions)
  n_targets = tf.reduce_sum(unlabeled_targets)
  n_true_positives = tf.reduce_sum(true_positives)
  # () - () -> ()
  n_false_positives = n_predictions - n_true_positives
  n_false_negatives = n_targets - n_true_positives
  # (n x m x m) -> (n)
  n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1, 2])
  n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1, 2])
  # (n) x 2 -> ()
  n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = predictions
  outputs['n_unlabeled_true_positives'] = n_true_positives
  outputs['n_unlabeled_false_positives'] = n_false_positives
  outputs['n_unlabeled_false_negatives'] = n_false_negatives
  outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
  outputs['predictions'] = predictions
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  return outputs
def get_unfactored_bilinear_classifier(self, layer, unlabeled_targets, token_weights, variable_scope=None, reuse=False):
  """"""
  
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_func = self.hidden_func
  hidden_size = self.hidden_size
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2 * hidden_size,
                                   hidden_func=hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2 * [hidden_size],
                                   hidden_func=hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    
    with tf.variable_scope('Classifier'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
      else:
        logits = classifiers.bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
  bucket_size = tf.shape(layer)[-2]
  
  #-------------------------------------------------------
  # Process the targets
  # c (*) (n x m) + (n x m)
  #targets = len(self) * unlabeled_targets + self.placeholder
  targets = bucket_size * self.placeholder + unlabeled_targets
  
  #-------------------------------------------------------
  # Process the logits
  # (n x m x c x m) -> (n x m x cm)
  reshaped_logits = tf.reshape(logits, tf.stack([-1, bucket_size, bucket_size * len(self)]))
  
  #-------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m x cm) -> (n x m x cm)
  probabilities = tf.nn.softmax(reshaped_logits)
  # (n x m x cm) -> (n x m x c x m)
  probabilities = tf.reshape(probabilities, tf.stack([-1, bucket_size, len(self), bucket_size]))
  # (n x m x c x m) -> (n x m x m x c)
  probabilities = tf.transpose(probabilities, [0, 1, 3, 2])
  # (n x m), (n x m x cm), (n x m) -> ()
  loss = tf.losses.sparse_softmax_cross_entropy(targets, reshaped_logits, weights=token_weights)
  
  #-------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x cm) -> (n x m)
  predictions = tf.argmax(reshaped_logits, axis=-1, output_type=tf.int32)
  # (n x m), () -> (n x m)
  unlabeled_predictions = tf.mod(predictions, bucket_size)
  # (n x m) (*) (n x m) -> (n x m)
  correct_tokens = nn.equal(predictions, targets) * token_weights
  correct_unlabeled_tokens = nn.equal(unlabeled_predictions, unlabeled_targets) * token_weights
  # (n x m) -> (n)
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  # (n x m) -> (n)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  correct_unlabeled_tokens_per_sequence = tf.reduce_sum(correct_unlabeled_tokens, axis=-1)
  # (n), (n) -> (n)
  correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
  correct_unlabeled_sequences = nn.equal(tokens_per_sequence, correct_unlabeled_tokens_per_sequence)
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = tf.constant(0.)
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = unlabeled_predictions
  outputs['label_predictions'] = predictions
  outputs['n_correct_unlabeled_tokens'] = tf.reduce_sum(correct_unlabeled_tokens)
  outputs['n_correct_unlabeled_sequences'] = tf.reduce_sum(correct_unlabeled_sequences)
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
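#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the method above
# folds the head index and the label id into a single flat class per token
# (bucket_size * label + head), takes one (m*c)-way softmax, and recovers
# the head with a modulus. The helper below shows that encode/decode pair;
# its name and arguments are hypothetical.
def _sketch_joint_head_label_coding(label_targets, head_targets, bucket_size):
  # encode: one flat class per (label, head) pair, matching the reshaped softmax
  joint_targets = bucket_size * label_targets + head_targets
  # decode a flat prediction back into its head index and label id
  head_predictions = tf.mod(joint_targets, bucket_size)
  label_predictions = tf.floordiv(joint_targets, bucket_size)
  return joint_targets, head_predictions, label_predictions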
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
  """"""
  
  layer1 = layer2 = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_func = self.hidden_func
  hidden_size = self.hidden_size
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2 * hidden_size,
                                   hidden_func=hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2 * [hidden_size],
                                   hidden_func=hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    
    with tf.variable_scope('Classifier'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
      else:
        logits = classifiers.bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob,
          add_linear=add_linear)
  bucket_size = tf.shape(layer)[-2]
  
  #-------------------------------------------------------
  # Process the targets
  # (n x m)
  label_targets = self.placeholder
  unlabeled_predictions = outputs['unlabeled_predictions']
  unlabeled_targets = outputs['unlabeled_targets']
  # (n x m) -> (n x m x m)
  unlabeled_predictions = tf.one_hot(unlabeled_predictions, bucket_size)
  unlabeled_targets = tf.one_hot(unlabeled_targets, bucket_size)
  # (n x m x m) -> (n x m x m x 1)
  unlabeled_predictions = tf.expand_dims(unlabeled_predictions, axis=-1)
  unlabeled_targets = tf.expand_dims(unlabeled_targets, axis=-1)
  
  #-------------------------------------------------------
  # Process the logits
  # We use the gold heads for computing the label score and the predicted
  # heads for computing the unlabeled attachment score
  # (n x m x c x m) -> (n x m x m x c)
  transposed_logits = tf.transpose(logits, [0, 1, 3, 2])
  # (n x m x c x m) * (n x m x m x 1) -> (n x m x c x 1)
  predicted_logits = tf.matmul(logits, unlabeled_predictions)
  oracle_logits = tf.matmul(logits, unlabeled_targets)
  # (n x m x c x 1) -> (n x m x c)
  predicted_logits = tf.squeeze(predicted_logits, axis=-1)
  oracle_logits = tf.squeeze(oracle_logits, axis=-1)
  
  #-------------------------------------------------------
  # Compute probabilities/cross entropy
  # (n x m x m) -> (n x m x m x 1)
  head_probabilities = tf.expand_dims(tf.stop_gradient(outputs['probabilities']), axis=-1)
  # (n x m x m x c) -> (n x m x m x c)
  label_probabilities = tf.nn.softmax(transposed_logits)
  # (n x m), (n x m x c), (n x m) -> ()
  label_loss = tf.losses.sparse_softmax_cross_entropy(label_targets, oracle_logits, weights=token_weights)
  
  #-------------------------------------------------------
  # Compute predictions/accuracy
  # (n x m x c) -> (n x m)
  label_predictions = tf.argmax(predicted_logits, axis=-1, output_type=tf.int32)
  label_oracle_predictions = tf.argmax(oracle_logits, axis=-1, output_type=tf.int32)
  # (n x m) (*) (n x m) -> (n x m)
  correct_label_tokens = nn.equal(label_targets, label_oracle_predictions) * token_weights
  correct_tokens = nn.equal(label_targets, label_predictions) * outputs['correct_unlabeled_tokens']
  # (n x m) -> (n)
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  # (n x m) -> (n)
  correct_label_tokens_per_sequence = tf.reduce_sum(correct_label_tokens, axis=-1)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  # (n), (n) -> (n)
  correct_label_sequences = nn.equal(tokens_per_sequence, correct_label_tokens_per_sequence)
  correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  rho = self.loss_interpolation
  outputs['label_targets'] = label_targets
  # This way we can reconstruct the head_probabilities by exponentiating and summing along the last axis
  outputs['probabilities'] = label_probabilities * head_probabilities
  outputs['label_loss'] = label_loss
  outputs['loss'] = 2 * ((1 - rho) * outputs['loss'] + rho * label_loss)
  outputs['label_predictions'] = label_predictions
  outputs['n_correct_label_tokens'] = tf.reduce_sum(correct_label_tokens)
  outputs['n_correct_label_sequences'] = tf.reduce_sum(correct_label_sequences)
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
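#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the final loss above
# interpolates the unlabeled (arc) loss already stored in `outputs['loss']`
# with the label loss. The helper name and arguments are hypothetical.
def _sketch_interpolated_parser_loss(arc_loss, label_loss, rho):
  # rho == .5 recovers a plain sum of the two losses; the factor of 2 keeps
  # the magnitude comparable to that sum for other values of rho
  return 2 * ((1 - rho) * arc_loss + rho * label_loss)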
def get_sampled_linear_classifier(self, layer, n_samples, token_weights=None, variable_scope=None, reuse=False):
  """"""
  
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, self.hidden_size,
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    batch_size, bucket_size, input_size = nn.get_sizes(layer)
    layer = nn.dropout(layer, hidden_keep_prob, noise_shape=[batch_size, 1, input_size])
    layer = nn.reshape(layer, [-1, input_size])
    
    with tf.variable_scope('Classifier'):
      # (s)
      samples, _, _ = tf.nn.log_uniform_candidate_sampler(
        nn.zeros([bucket_size, 1], dtype=tf.int64),
        1, n_samples, unique=True, range_max=len(self))
      with tf.device('/gpu:1'):
        weights = tf.get_variable('Weights', shape=[len(self), input_size], initializer=tf.zeros_initializer)
        biases = tf.get_variable('Biases', shape=len(self), initializer=tf.zeros_initializer)
        tf.add_to_collection('non_save_variables', weights)
        tf.add_to_collection('non_save_variables', biases)
        
        # (nm x 1)
        targets = nn.reshape(self.placeholder, [-1, 1])
        # (1 x s)
        samples = tf.expand_dims(samples, 0)
        # (nm x s)
        samples = tf.to_int32(nn.tile(samples, [batch_size * bucket_size, 1]))
        # (nm x s)
        sample_weights = tf.to_float(nn.not_equal(samples, targets))
        # (nm x 1+s)
        cands = tf.stop_gradient(tf.concat([targets, samples], axis=-1))
        # (nm x 1), (nm x s) -> (nm x 1+s)
        cand_weights = tf.stop_gradient(tf.concat([nn.ones([batch_size * bucket_size, 1]), sample_weights], axis=-1))
        # (c x d), (nm x 1+s) -> (nm x 1+s x d)
        weights = tf.nn.embedding_lookup(weights, cands)
        # (c), (nm x 1+s) -> (nm x 1+s)
        biases = tf.nn.embedding_lookup(biases, cands)
        # (n x m x d) -> (nm x d x 1)
        layer_reshaped = nn.reshape(layer, [-1, input_size, 1])
        # (nm x 1+s x d) * (nm x d x 1) -> (nm x 1+s x 1)
        logits = tf.matmul(weights, layer_reshaped)
        # (nm x 1+s x 1) -> (nm x 1+s)
        logits = tf.squeeze(logits, -1)
  
  #-----------------------------------------------------------
  # Compute probabilities/cross entropy
  # (nm x 1+s)
  logits = logits - tf.reduce_max(logits, axis=-1, keep_dims=True)
  # (nm x 1+s)
  exp_logits = tf.exp(logits) * cand_weights
  # (nm x 1)
  exp_logit_sum = tf.reduce_sum(exp_logits, axis=-1, keep_dims=True)
  # (nm x 1+s)
  probabilities = exp_logits / exp_logit_sum
  # (nm x 1+s) -> (n x m x 1+s)
  probabilities = nn.reshape(probabilities, [batch_size, bucket_size, 1 + n_samples])
  # (nm x 1+s) -> (n x m x 1+s)
  # NOTE: reshape the candidate ids (gold class first, then samples) so that
  # they line up with the probabilities returned below
  samples = nn.reshape(cands, [batch_size, bucket_size, 1 + n_samples])
  # (nm x 1+s) -> (nm x 1), (nm x s)
  target_logits, _ = tf.split(logits, [1, n_samples], axis=1)
  # (nm x 1) - (nm x 1) -> (nm x 1)
  loss = tf.log(exp_logit_sum) - target_logits
  # (n x m) -> (nm x 1)
  token_weights1D = tf.to_float(nn.reshape(token_weights, [-1, 1]))
  # (nm x 1) -> ()
  loss = tf.reduce_sum(loss * token_weights1D) / tf.reduce_sum(token_weights1D)
  
  #-----------------------------------------------------------
  # Compute predictions/accuracy
  # (nm x 1+s) -> (n x m x 1+s)
  logits = nn.reshape(logits, [batch_size, bucket_size, -1])
  # (n x m x 1+s) -> (n x m)
  predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
  # (n x m) (*) (n x m) -> (n x m)
  correct_tokens = nn.equal(predictions, 0) * token_weights
  # (n x m) -> (n)
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  # (n x m) -> (n)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  # (n), (n) -> (n)
  correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['targets'] = targets
  outputs['probabilities'] = tf.tuple([samples, probabilities])
  outputs['loss'] = loss
  outputs['predictions'] = predictions
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
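#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): the sampled softmax
# above normalizes only over the gold class plus the drawn candidates,
# zeroing out any sample that collides with the gold class. The helper name
# and arguments are hypothetical, assuming `target_logits` (nm x 1),
# `sample_logits` (nm x s), and 0/1 float `sample_weights` (nm x s).
def _sketch_sampled_softmax_loss(target_logits, sample_logits, sample_weights):
  # gold class first, exactly as `cands` above puts `targets` in column 0
  logits = tf.concat([target_logits, sample_logits], axis=-1)
  # subtract the max for numerical stability before exponentiating
  logits -= tf.reduce_max(logits, axis=-1, keep_dims=True)
  # samples that duplicate the gold class are removed from the normalizer
  exp_logits = tf.exp(logits) * tf.concat([tf.ones_like(target_logits), sample_weights], axis=-1)
  exp_logit_sum = tf.reduce_sum(exp_logits, axis=-1, keep_dims=True)
  # negative log-likelihood of the gold class under the sampled normalizer
  return tf.log(exp_logit_sum) - (target_logits - tf.reduce_max(tf.concat([target_logits, sample_logits], axis=-1), axis=-1, keep_dims=True))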
def get_linear_classifier(self, layer, token_weights, last_output=None, variable_scope=None, reuse=False):
  """"""
  
  if last_output is not None:
    n_layers = 0
    layer = last_output['hidden_layer']
    recur_layer = last_output['recur_layer']
  else:
    n_layers = self.n_layers
    recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, n_layers):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, self.hidden_size,
                                   hidden_func=self.hidden_func,
                                   hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('Classifier'):
      probabilities = []
      loss = []
      predictions = []
      correct_tokens = []
      for i, feat in enumerate(self._feats):
        vs_feat = str(feat).replace('[', '-RSB-').replace(']', '-LSB-')
        with tf.variable_scope(vs_feat):
          logits = classifiers.linear_classifier(layer, self.getlen(feat),
                                                 hidden_keep_prob=hidden_keep_prob)
          targets = self.placeholder[:, :, i]
          
          #---------------------------------------------------
          # Compute probabilities/cross entropy
          # (n x m x c) -> (n x m x c)
          probabilities.append(tf.nn.softmax(logits))
          # (n x m), (n x m x c), (n x m) -> ()
          loss.append(tf.losses.sparse_softmax_cross_entropy(targets, logits, weights=token_weights))
          
          #---------------------------------------------------
          # Compute predictions/accuracy
          # (n x m x c) -> (n x m)
          predictions.append(tf.argmax(logits, axis=-1, output_type=tf.int32))
          # (n x m) (*) (n x m) -> (n x m)
          correct_tokens.append(nn.equal(targets, predictions[-1]))
  
  # (n x m) x f -> (n x m x f)
  predictions = tf.stack(predictions, axis=-1)
  # (n x m) x f -> (n x m x f)
  correct_tokens = tf.stack(correct_tokens, axis=-1)
  # (n x m x f) -> (n x m)
  correct_tokens = tf.reduce_prod(correct_tokens, axis=-1) * token_weights
  # (n x m) -> (n)
  tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
  # (n x m) -> (n)
  correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
  # (n), (n) -> (n)
  correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
  
  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['targets'] = self.placeholder
  outputs['probabilities'] = probabilities
  outputs['loss'] = tf.add_n(loss)
  outputs['predictions'] = predictions
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
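#-----------------------------------------------------------
# Illustrative sketch (not part of the original class): with one classifier
# per feature, a token is counted as correct only when every feature is
# predicted correctly, which the method above gets by taking a product over
# the stacked per-feature correctness indicators. The helper name and
# arguments are hypothetical; `per_feat_correct` is a list of 0/1 integer
# (n x m) tensors.
def _sketch_multi_feat_accuracy(per_feat_correct, token_weights):
  # (n x m) x f -> (n x m x f)
  stacked = tf.stack(per_feat_correct, axis=-1)
  # the product is 1 only if every feature of the token is correct
  correct_tokens = tf.reduce_prod(stacked, axis=-1) * token_weights
  return tf.reduce_sum(correct_tokens)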