def get_input_tensor(self, outputs, reuse=True):
  """"""
  output_keep_prob = 1. if reuse else self.output_keep_prob
  for output in outputs:
    pass # we just need to grab one
  layer = output['recur_layer']
  with tf.variable_scope(self.classname):
    layer = classifiers.hiddens(layer, self.output_size,
                                hidden_func=self.output_func,
                                hidden_keep_prob=output_keep_prob,
                                reuse=reuse)
  return [layer]
def get_bilinear_classifier(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_func = self.hidden_func
  hidden_size = self.hidden_size
  add_linear = self.add_linear
  linearize = self.linearize
  distance = self.distance
  n_splits = 2*(1+linearize+distance)
  with tf.variable_scope(variable_scope or self.field):
    for i in six.moves.range(0, self.n_layers-1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, n_splits*hidden_size,
                                   hidden_func=hidden_func, hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, n_splits*[hidden_size],
                                   hidden_func=hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    if linearize:
      lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
    if distance:
      dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Attention'):
      if self.diagonal:
        logits, _ = classifiers.diagonal_bilinear_attention(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.diagonal_bilinear_discriminator(
              lin_layer1, lin_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1+tf.nn.softplus(classifiers.diagonal_bilinear_discriminator(
              dist_layer1, dist_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear))
      else:
        logits, _ = classifiers.bilinear_attention(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.bilinear_discriminator(
              lin_layer1, lin_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1+tf.nn.softplus(classifiers.bilinear_discriminator(
              dist_layer1, dist_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear))

    #-----------------------------------------------------------
    # Process the targets
    targets = self.placeholder
    shape = tf.shape(layer1)
    batch_size, bucket_size = shape[0], shape[1]
    # (1 x m)
    ids = tf.expand_dims(tf.range(bucket_size), 0)
    # (1 x m) -> (1 x 1 x m)
    head_ids = tf.expand_dims(ids, -2)
    # (1 x m) -> (1 x m x 1)
    dep_ids = tf.expand_dims(ids, -1)
    if linearize:
      # Wherever the head is to the left
      # (n x m), (1 x m) -> (n x m)
      lin_targets = tf.to_float(tf.less(targets, ids))
      # cross-entropy of the linearization of each i,j pair
      # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
      lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
      # (n x 1 x m), (n x m x 1) -> (n x m x m)
      lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
      # add the cross-entropy to the logits
      # (n x m x m), (n x m x m) -> (n x m x m)
      logits += tf.stop_gradient(lin_xent)
    if distance:
      # (n x m) - (1 x m) -> (n x m)
      dist_targets = tf.abs(targets - ids)
      # KL-divergence of the distance of each i,j pair
      # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
      dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1]))+1e-12
      # (n x m x m), (n x m x m) -> (n x m x m)
      #dist_kld = (dist_ids * tf.log(dist_lamda / dist_ids) + dist_ids - dist_lamda)
      dist_kld = -tf.log((dist_ids - dist_lamda)**2/2 + 1)
      # add the KL-divergence to the logits
      # (n x m x m), (n x m x m) -> (n x m x m)
      logits += tf.stop_gradient(dist_kld)

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m) + (m) -> (n x m)
    non_pads = tf.to_float(token_weights) + tf.to_float(tf.logical_not(tf.cast(tf.range(bucket_size), dtype=tf.bool)))
    # (n x m x m) o (n x 1 x m) -> (n x m x m)
    probabilities = tf.nn.softmax(logits) * tf.expand_dims(non_pads, -2)
    # (n x m), (n x m x m), (n x m) -> ()
    loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, weights=token_weights)
    # (n x m) -> (n x m x m x 1)
    one_hot_targets = tf.expand_dims(tf.one_hot(targets, bucket_size), -1)
    # (n x m) -> ()
    n_tokens = tf.to_float(tf.reduce_sum(token_weights))
    if linearize:
      # (n x m x m) -> (n x m x 1 x m)
      lin_xent_reshaped = tf.expand_dims(lin_xent, -2)
      # (n x m x 1 x m) * (n x m x m x 1) -> (n x m x 1 x 1)
      lin_target_xent = tf.matmul(lin_xent_reshaped, one_hot_targets)
      # (n x m x 1 x 1) -> (n x m)
      lin_target_xent = tf.squeeze(lin_target_xent, [-1, -2])
      # (n x m), (n x m), (n x m) -> ()
      loss -= tf.reduce_sum(lin_target_xent*tf.to_float(token_weights)) / (n_tokens + 1e-12)
    if distance:
      # (n x m x m) -> (n x m x 1 x m)
      dist_kld_reshaped = tf.expand_dims(dist_kld, -2)
      # (n x m x 1 x m) * (n x m x m x 1) -> (n x m x 1 x 1)
      dist_target_kld = tf.matmul(dist_kld_reshaped, one_hot_targets)
      # (n x m x 1 x 1) -> (n x m)
      dist_target_kld = tf.squeeze(dist_target_kld, [-1, -2])
      # (n x m), (n x m), (n x m) -> ()
      loss -= tf.reduce_sum(dist_target_kld*tf.to_float(token_weights)) / (n_tokens + 1e-12)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # (n x m x m) -> (n x m)
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    # (n x m) (*) (n x m) -> (n x m)
    correct_tokens = nn.equal(targets, predictions) * token_weights
    # (n x m) -> (n)
    tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
    # (n x m) -> (n)
    correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
    # (n), (n) -> (n)
    correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)

  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = self.placeholder
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = predictions
  outputs['predictions'] = predictions
  outputs['correct_unlabeled_tokens'] = correct_tokens
  outputs['n_correct_unlabeled_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_unlabeled_sequences'] = tf.reduce_sum(correct_sequences)
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
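# Editor's note (a sketch of the auxiliary terms above, not part of the original source):
# the linearization score is a log-probability of the head lying in the observed direction.
# Since -softplus(-x) = log(sigmoid(x)) and -softplus(x) = log(1 - sigmoid(x)),
#   lin_xent = -softplus(where(head_id < dep_id, -lin_logits, lin_logits))
# gives log P(head is to the left) for candidate heads left of the dependent and
# log P(head is to the right) otherwise. Adding lin_xent (and dist_kld) under
# tf.stop_gradient biases head selection at decoding time without back-propagating
# through the attention logits; their own contributions to the training loss are the
# separate `loss -= ...` terms gathered at the gold head via `one_hot_targets`.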
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  n_splits = 2*(1+self.linearize+self.distance)
  with tf.variable_scope(variable_scope or self.field):
    for i in six.moves.range(0, self.n_layers-1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, n_splits*self.hidden_size,
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, n_splits*[self.hidden_size],
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)
    if self.linearize:
      lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
    if self.distance:
      dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Discriminator'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_discriminator(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.diagonal_bilinear_discriminator(
              lin_layer1, lin_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1+tf.nn.softplus(classifiers.diagonal_bilinear_discriminator(
              dist_layer1, dist_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear))
      else:
        logits = classifiers.bilinear_discriminator(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.bilinear_discriminator(
              lin_layer1, lin_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1+tf.nn.softplus(classifiers.bilinear_discriminator(
              dist_layer1, dist_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear))

    #-----------------------------------------------------------
    # Process the targets
    # (n x m x m) -> (n x m x m)
    unlabeled_targets = self.placeholder
    shape = tf.shape(layer1)
    batch_size, bucket_size = shape[0], shape[1]
    # (1 x m)
    ids = tf.expand_dims(tf.range(bucket_size), 0)
    # (1 x m) -> (1 x 1 x m)
    head_ids = tf.expand_dims(ids, -2)
    # (1 x m) -> (1 x m x 1)
    dep_ids = tf.expand_dims(ids, -1)
    if self.linearize:
      # Wherever the head is to the left
      # (n x m x m), (1 x m x 1) -> (n x m x m)
      lin_targets = tf.to_float(tf.less(unlabeled_targets, dep_ids))
      # cross-entropy of the linearization of each i,j pair
      # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
      lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
      # (n x 1 x m), (n x m x 1) -> (n x m x m)
      lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
      # add the cross-entropy to the logits
      # (n x m x m), (n x m x m) -> (n x m x m)
      logits += tf.stop_gradient(lin_xent)
    if self.distance:
      # (n x m x m) - (1 x m x 1) -> (n x m x m)
      dist_targets = tf.abs(unlabeled_targets - dep_ids)
      # KL-divergence of the distance of each i,j pair
      # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
      dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1]))+1e-12
      # (n x m x m), (n x m x m) -> (n x m x m)
      #dist_kld = (dist_ids * tf.log(dist_lamda / dist_ids) + dist_ids - dist_lamda)
      dist_kld = -tf.log((dist_ids - dist_lamda)**2/2 + 1)
      # add the KL-divergence to the logits
      # (n x m x m), (n x m x m) -> (n x m x m)
      logits += tf.stop_gradient(dist_kld)

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m x m) -> (n x m x m)
    probabilities = tf.nn.sigmoid(logits) * tf.to_float(token_weights)
    # (n x m x m), (n x m x m), (n x m x m) -> ()
    loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)
    n_tokens = tf.to_float(tf.reduce_sum(token_weights))
    if self.linearize:
      lin_target_xent = lin_xent * unlabeled_targets
      loss -= tf.reduce_sum(lin_target_xent * tf.to_float(token_weights)) / (n_tokens + 1e-12)
    if self.distance:
      dist_target_kld = dist_kld * unlabeled_targets
      loss -= tf.reduce_sum(dist_target_kld * tf.to_float(token_weights)) / (n_tokens + 1e-12)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # (n x m x m) -> (n x m x m)
    # NOTE: dtype fixed to tf.int32 (was tf.int64) to match token_weights and the
    # sibling discriminators elsewhere in this file.
    predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = predictions * unlabeled_targets
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  outputs['logits'] = logits
  outputs['unlabeled_predictions'] = predictions
  outputs['n_unlabeled_true_positives'] = n_true_positives
  outputs['n_unlabeled_false_positives'] = n_false_positives
  outputs['n_unlabeled_false_negatives'] = n_false_negatives
  outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
  outputs['predictions'] = predictions
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  return outputs
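# Editor's note (a sketch, not part of the original source): this discriminator scores every
# (head, dependent) cell independently with a sigmoid, so an edge is predicted whenever its
# logit is positive. The counts returned above are exactly what a caller needs to compute
# precision/recall/F1 downstream, e.g.:
#
#   precision = n_true_positives / (n_true_positives + n_false_positives + 1e-12)
#   recall    = n_true_positives / (n_true_positives + n_false_negatives + 1e-12)
#   f1        = 2 * precision * recall / (precision + recall + 1e-12)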
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
  """"""
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_size = self.hidden_size
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2*hidden_size,
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2*[hidden_size],
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Classifier'):
      probabilities = []
      loss = []
      predictions = []
      correct_tokens = []
      for i, feat in enumerate(self._feats):
        vs_feat = str(feat).replace('[', '-RSB-').replace(']', '-LSB-')
        with tf.variable_scope(vs_feat):
          if self.diagonal:
            logits = classifiers.diagonal_bilinear_classifier(
              layer1, layer2, self.getlen(feat),
              hidden_keep_prob=hidden_keep_prob, add_linear=self.add_linear)
          else:
            logits = classifiers.bilinear_classifier(
              layer1, layer2, self.getlen(feat),
              hidden_keep_prob=hidden_keep_prob, add_linear=self.add_linear)
          targets = self.placeholder[:, :, i]

          #---------------------------------------------------
          # Compute probabilities/cross entropy
          # (n x m x c) -> (n x m x c)
          probabilities.append(tf.nn.softmax(logits))
          # (n x m), (n x m x c), (n x m) -> ()
          loss.append(tf.losses.sparse_softmax_cross_entropy(targets, logits, weights=token_weights))

          #---------------------------------------------------
          # Compute predictions/accuracy
          # (n x m x c) -> (n x m)
          predictions.append(tf.argmax(logits, axis=-1, output_type=tf.int32))
          # (n x m) (*) (n x m) -> (n x m)
          correct_tokens.append(nn.equal(targets, predictions[-1]))
      # (n x m) x f -> (n x m x f)
      predictions = tf.stack(predictions, axis=-1)
      # (n x m) x f -> (n x m x f)
      correct_tokens = tf.stack(correct_tokens, axis=-1)
      # (n x m x f) -> (n x m)
      correct_tokens = tf.reduce_prod(correct_tokens, axis=-1) * token_weights
      # (n x m) -> (n)
      tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
      # (n x m) -> (n)
      correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
      # (n), (n) -> (n)
      correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)

  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['targets'] = self.placeholder
  outputs['probabilities'] = probabilities
  outputs['loss'] = tf.add_n(loss)
  outputs['predictions'] = predictions
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
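# Editor's note (a sketch, not part of the original source): each feature in self._feats gets
# its own bilinear softmax over that feature's value set; the per-feature losses are summed
# with tf.add_n, and a token only counts as correct when every feature is right
# (tf.reduce_prod over the feature axis). For two features with losses l1 and l2 the
# reported loss is simply l1 + l2, and a token with one wrong feature contributes 0 to
# n_correct_tokens.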
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
  """"""
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.field):
    for i in six.moves.range(0, self.n_layers-1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2*self.hidden_size,
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2*[self.hidden_size],
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Classifier'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
      else:
        logits = classifiers.bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)

    #-----------------------------------------------------------
    # Process the targets
    # (n x m x m)
    label_targets = self.placeholder
    unlabeled_predictions = outputs['unlabeled_predictions']
    unlabeled_targets = outputs['unlabeled_targets']

    #-----------------------------------------------------------
    # Process the logits
    # (n x m x c x m) -> (n x m x m x c)
    transposed_logits = tf.transpose(logits, [0,1,3,2])

    #-----------------------------------------------------------
    # Compute the probabilities/cross entropy
    # (n x m x m) -> (n x m x m x 1)
    head_probabilities = tf.expand_dims(tf.stop_gradient(outputs['probabilities']), axis=-1)
    # (n x m x m x c) -> (n x m x m x c)
    label_probabilities = tf.nn.softmax(transposed_logits) * tf.to_float(tf.expand_dims(token_weights, axis=-1))
    # (n x m x m), (n x m x m x c), (n x m x m) -> ()
    label_loss = tf.losses.sparse_softmax_cross_entropy(
      label_targets, transposed_logits, weights=token_weights*unlabeled_targets)

    #-----------------------------------------------------------
    # Compute the predictions/accuracy
    # (n x m x m x c) -> (n x m x m)
    # NOTE: output_type fixed to tf.int32 (was tf.int64) so it matches the dtype of
    # label_targets in the comparisons below.
    predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int32)
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = nn.equal(label_targets, predictions) * unlabeled_predictions
    correct_label_tokens = nn.equal(label_targets, predictions) * unlabeled_targets
    # (n x m x m) -> ()
    n_unlabeled_predictions = tf.reduce_sum(unlabeled_predictions)
    n_unlabeled_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    n_correct_label_tokens = tf.reduce_sum(correct_label_tokens)
    # () - () -> ()
    n_false_positives = n_unlabeled_predictions - n_true_positives
    n_false_negatives = n_unlabeled_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
    n_correct_label_tokens_per_sequence = tf.reduce_sum(correct_label_tokens, axis=[1,2])
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))
    n_correct_label_sequences = tf.reduce_sum(nn.equal(n_correct_label_tokens_per_sequence, n_targets_per_sequence))

  #-----------------------------------------------------------
  # Populate the output dictionary
  rho = self.loss_interpolation
  outputs['label_targets'] = label_targets
  outputs['probabilities'] = label_probabilities * head_probabilities
  outputs['label_loss'] = label_loss
  outputs['loss'] = 2*((1-rho) * outputs['loss'] + rho * label_loss)
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  outputs['n_correct_label_tokens'] = n_correct_label_tokens
  outputs['n_correct_label_sequences'] = n_correct_label_sequences
  outputs['label_predictions'] = predictions
  outputs['label_logits'] = transposed_logits
  return outputs
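# Editor's note (a sketch, not part of the original source): with rho = self.loss_interpolation,
# the combined objective used above is
#   loss = 2 * ((1 - rho) * unlabeled_loss + rho * label_loss)
# so rho = 0.5 reduces to the plain sum unlabeled_loss + label_loss, while larger rho
# up-weights the label term; e.g. rho = 0.75 gives 0.5*unlabeled_loss + 1.5*label_loss.
# The label loss itself is masked by token_weights*unlabeled_targets, i.e. it is only
# computed at cells where a gold edge exists.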
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False):
  """"""
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers-1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2*self.hidden_size,
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    # NOTE: fixed scope name (was "'FC-top' % i", which raises a formatting error).
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2*[self.hidden_size],
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Discriminator'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_discriminator(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
      else:
        logits = classifiers.bilinear_discriminator(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)

    #-----------------------------------------------------------
    # Process the targets
    # (n x m x m) -> (n x m x m)
    unlabeled_targets = nn.greater(self.placeholder, 0)

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m x m) -> (n x m x m)
    probabilities = tf.nn.sigmoid(logits)
    # (n x m x m), (n x m x m), (n x m x m) -> ()
    loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # (n x m x m) -> (n x m x m)
    predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = predictions * unlabeled_targets
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = predictions
  outputs['n_unlabeled_true_positives'] = n_true_positives
  outputs['n_unlabeled_false_positives'] = n_false_positives
  outputs['n_unlabeled_false_negatives'] = n_false_negatives
  outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
  outputs['predictions'] = predictions
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  return outputs
def get_unfactored_bilinear_classifier(self, layer, unlabeled_targets, token_weights, variable_scope=None, reuse=False):
  """"""
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_func = self.hidden_func
  hidden_size = self.hidden_size
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers-1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2*hidden_size,
                                   hidden_func=hidden_func, hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2*[hidden_size],
                                   hidden_func=hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Classifier'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
      else:
        logits = classifiers.bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
      bucket_size = tf.shape(layer)[-2]

      #-------------------------------------------------------
      # Process the targets
      # c (*) (n x m) + (n x m)
      #targets = len(self) * unlabeled_targets + self.placeholder
      targets = bucket_size * self.placeholder + unlabeled_targets

      #-------------------------------------------------------
      # Process the logits
      # (n x m x c x m) -> (n x m x cm)
      reshaped_logits = tf.reshape(logits, tf.stack([-1, bucket_size, bucket_size * len(self)]))

      #-------------------------------------------------------
      # Compute probabilities/cross entropy
      # (n x m x cm) -> (n x m x cm)
      probabilities = tf.nn.softmax(reshaped_logits)
      # (n x m x cm) -> (n x m x c x m)
      probabilities = tf.reshape(probabilities, tf.stack([-1, bucket_size, len(self), bucket_size]))
      # (n x m x c x m) -> (n x m x m x c)
      probabilities = tf.transpose(probabilities, [0,1,3,2])
      # (n x m), (n x m x cm), (n x m) -> ()
      loss = tf.losses.sparse_softmax_cross_entropy(targets, reshaped_logits, weights=token_weights)

      #-------------------------------------------------------
      # Compute predictions/accuracy
      # (n x m x cm) -> (n x m)
      predictions = tf.argmax(reshaped_logits, axis=-1, output_type=tf.int32)
      # (n x m), () -> (n x m)
      unlabeled_predictions = tf.mod(predictions, bucket_size)
      # (n x m) (*) (n x m) -> (n x m)
      correct_tokens = nn.equal(predictions, targets) * token_weights
      correct_unlabeled_tokens = nn.equal(unlabeled_predictions, unlabeled_targets) * token_weights
      # (n x m) -> (n)
      tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
      # (n x m) -> (n)
      correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
      correct_unlabeled_tokens_per_sequence = tf.reduce_sum(correct_unlabeled_tokens, axis=-1)
      # (n), (n) -> (n)
      correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
      correct_unlabeled_sequences = nn.equal(tokens_per_sequence, correct_unlabeled_tokens_per_sequence)

  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs = {}
  outputs['recur_layer'] = recur_layer
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = tf.constant(0.)
  outputs['loss'] = loss
  outputs['unlabeled_predictions'] = unlabeled_predictions
  outputs['label_predictions'] = predictions
  outputs['n_correct_unlabeled_tokens'] = tf.reduce_sum(correct_unlabeled_tokens)
  outputs['n_correct_unlabeled_sequences'] = tf.reduce_sum(correct_unlabeled_sequences)
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
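# Editor's note (a sketch, not part of the original source): the unfactored classifier packs
# the head index and the label into a single target, targets = bucket_size * label + head,
# and flattens the per-token (c x m) score matrix into one cm-way softmax. The head is
# recovered with tf.mod(prediction, bucket_size) and the label with prediction // bucket_size.
# For example, with bucket_size = 40, label = 3, head = 7:
#   target = 40*3 + 7 = 127,  127 % 40 = 7 (head),  127 // 40 = 3 (label).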
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
  """"""
  layer1 = layer2 = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  hidden_func = self.hidden_func
  hidden_size = self.hidden_size
  add_linear = self.add_linear
  with tf.variable_scope(variable_scope or self.classname):
    for i in six.moves.range(0, self.n_layers-1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, 2*hidden_size,
                                   hidden_func=hidden_func, hidden_keep_prob=hidden_keep_prob)
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, 2*[hidden_size],
                                   hidden_func=hidden_func, hidden_keep_prob=hidden_keep_prob)
    layer1, layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Classifier'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
      else:
        logits = classifiers.bilinear_classifier(
          layer1, layer2, len(self),
          hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
      bucket_size = tf.shape(layer)[-2]

      #-------------------------------------------------------
      # Process the targets
      # (n x m)
      label_targets = self.placeholder
      unlabeled_predictions = outputs['unlabeled_predictions']
      unlabeled_targets = outputs['unlabeled_targets']
      # (n x m) -> (n x m x m)
      unlabeled_predictions = tf.one_hot(unlabeled_predictions, bucket_size)
      unlabeled_targets = tf.one_hot(unlabeled_targets, bucket_size)
      # (n x m x m) -> (n x m x m x 1)
      unlabeled_predictions = tf.expand_dims(unlabeled_predictions, axis=-1)
      unlabeled_targets = tf.expand_dims(unlabeled_targets, axis=-1)

      #-------------------------------------------------------
      # Process the logits
      # We use the gold heads for computing the label score and the predicted
      # heads for computing the unlabeled attachment score
      # (n x m x c x m) -> (n x m x m x c)
      transposed_logits = tf.transpose(logits, [0,1,3,2])
      # (n x m x c x m) * (n x m x m x 1) -> (n x m x c x 1)
      predicted_logits = tf.matmul(logits, unlabeled_predictions)
      oracle_logits = tf.matmul(logits, unlabeled_targets)
      # (n x m x c x 1) -> (n x m x c)
      predicted_logits = tf.squeeze(predicted_logits, axis=-1)
      oracle_logits = tf.squeeze(oracle_logits, axis=-1)

      #-------------------------------------------------------
      # Compute probabilities/cross entropy
      # (n x m x m) -> (n x m x m x 1)
      head_probabilities = tf.expand_dims(tf.stop_gradient(outputs['probabilities']), axis=-1)
      # (n x m x m x c) -> (n x m x m x c)
      label_probabilities = tf.nn.softmax(transposed_logits)
      # (n x m), (n x m x c), (n x m) -> ()
      label_loss = tf.losses.sparse_softmax_cross_entropy(label_targets, oracle_logits, weights=token_weights)

      #-------------------------------------------------------
      # Compute predictions/accuracy
      # (n x m x c) -> (n x m)
      label_predictions = tf.argmax(predicted_logits, axis=-1, output_type=tf.int32)
      label_oracle_predictions = tf.argmax(oracle_logits, axis=-1, output_type=tf.int32)
      # (n x m) (*) (n x m) -> (n x m)
      correct_label_tokens = nn.equal(label_targets, label_oracle_predictions) * token_weights
      correct_tokens = nn.equal(label_targets, label_predictions) * outputs['correct_unlabeled_tokens']
      # (n x m) -> (n)
      tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
      # (n x m) -> (n)
      correct_label_tokens_per_sequence = tf.reduce_sum(correct_label_tokens, axis=-1)
      correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
      # (n), (n) -> (n)
      correct_label_sequences = nn.equal(tokens_per_sequence, correct_label_tokens_per_sequence)
      correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)

  #-----------------------------------------------------------
  # Populate the output dictionary
  rho = self.loss_interpolation
  outputs['label_targets'] = label_targets
  # This way we can reconstruct the head_probabilities by exponentiating and summing along the last axis
  outputs['probabilities'] = label_probabilities * head_probabilities
  outputs['label_loss'] = label_loss
  outputs['loss'] = 2*((1-rho) * outputs['loss'] + rho * label_loss)
  outputs['label_predictions'] = label_predictions
  outputs['n_correct_label_tokens'] = tf.reduce_sum(correct_label_tokens)
  outputs['n_correct_label_sequences'] = tf.reduce_sum(correct_label_sequences)
  outputs['n_correct_tokens'] = tf.reduce_sum(correct_tokens)
  outputs['n_correct_sequences'] = tf.reduce_sum(correct_sequences)
  return outputs
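# Editor's note (a sketch, not part of the original source): two sets of label scores are kept
# above. `oracle_logits` gathers the label scores at the *gold* head (via the one-hot matmul),
# so the label loss is not corrupted by head-prediction errors; `predicted_logits` gathers them
# at the *predicted* head and feeds the labeled-attachment accuracy counts. The same
# 2*((1-rho)*loss + rho*label_loss) interpolation as in the graph classifier combines the
# head loss already stored in outputs['loss'] with the label loss computed here.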
def get_bilinear_discriminator(self, layer, token_weights, variable_scope=None, reuse=False, debug=False):
  """"""
  outputs = {}
  recur_layer = layer
  hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
  add_linear = self.add_linear
  n_splits = 2 * (1 + self.linearize + self.distance)
  with tf.variable_scope(variable_scope or self.field):
    # hidden FC layers before the top split (skipped when n_layers == 1)
    for i in six.moves.range(0, self.n_layers - 1):
      with tf.variable_scope('FC-%d' % i):
        layer = classifiers.hidden(layer, n_splits * self.hidden_size,
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    # top FC layer, split into n_splits chunks
    with tf.variable_scope('FC-top'):
      layers = classifiers.hiddens(layer, n_splits * [self.hidden_size],
                                   hidden_func=self.hidden_func, hidden_keep_prob=hidden_keep_prob)
    # layer1/layer2: head and dependent representations of the same sentence
    layer1, layer2 = layers.pop(0), layers.pop(0)
    if self.linearize:
      lin_layer1, lin_layer2 = layers.pop(0), layers.pop(0)
    if self.distance:
      dist_layer1, dist_layer2 = layers.pop(0), layers.pop(0)

    with tf.variable_scope('Discriminator'):
      if self.diagonal:
        logits = classifiers.diagonal_bilinear_discriminator(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.diagonal_bilinear_discriminator(
              lin_layer1, lin_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1 + tf.nn.softplus(classifiers.diagonal_bilinear_discriminator(
              dist_layer1, dist_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear))
      else:
        # default (non-diagonal) branch
        logits = classifiers.bilinear_discriminator(
          layer1, layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.linearize:
          with tf.variable_scope('Linearization'):
            lin_logits = classifiers.bilinear_discriminator(
              lin_layer1, lin_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear)
        if self.distance:
          with tf.variable_scope('Distance'):
            dist_lamda = 1 + tf.nn.softplus(classifiers.bilinear_discriminator(
              dist_layer1, dist_layer2, hidden_keep_prob=hidden_keep_prob, add_linear=add_linear))

    #-----------------------------------------------------------
    # Process the targets
    # (n x m x m) -> (n x m x m)
    # the target is an (m x m) adjacency matrix of gold edges per sentence
    unlabeled_targets = self.placeholder
    shape = tf.shape(layer1)
    batch_size, bucket_size = shape[0], shape[1]
    # (1 x m)
    ids = tf.expand_dims(tf.range(bucket_size), 0)
    # (1 x m) -> (1 x 1 x m)
    head_ids = tf.expand_dims(ids, -2)
    # (1 x m) -> (1 x m x 1)
    dep_ids = tf.expand_dims(ids, -1)
    # optional linearization term (not used in the default graph configuration)
    if self.linearize:
      # Wherever the head is to the left
      # (n x m x m), (1 x m x 1) -> (n x m x m)
      lin_targets = tf.to_float(tf.less(unlabeled_targets, dep_ids))
      # cross-entropy of the linearization of each i,j pair
      # (1 x 1 x m), (1 x m x 1) -> (n x m x m)
      lin_ids = tf.tile(tf.less(head_ids, dep_ids), [batch_size, 1, 1])
      # (n x 1 x m), (n x m x 1) -> (n x m x m)
      lin_xent = -tf.nn.softplus(tf.where(lin_ids, -lin_logits, lin_logits))
      # add the cross-entropy to the logits
      # (n x m x m), (n x m x m) -> (n x m x m)
      logits += tf.stop_gradient(lin_xent)
    if self.distance:
      # (n x m x m) - (1 x m x 1) -> (n x m x m)
      dist_targets = tf.abs(unlabeled_targets - dep_ids)
      # KL-divergence of the distance of each i,j pair
      # (1 x 1 x m) - (1 x m x 1) -> (n x m x m)
      dist_ids = tf.to_float(tf.tile(tf.abs(head_ids - dep_ids), [batch_size, 1, 1])) + 1e-12
      # (n x m x m), (n x m x m) -> (n x m x m)
      #dist_kld = (dist_ids * tf.log(dist_lamda / dist_ids) + dist_ids - dist_lamda)
      dist_kld = -tf.log((dist_ids - dist_lamda)**2 / 2 + 1)
      # add the KL-divergence to the logits
      # (n x m x m), (n x m x m) -> (n x m x m)
      logits += tf.stop_gradient(dist_kld)

    if debug:
      outputs['printdata'] = {}
      outputs['printdata']['logits'] = logits

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m x m) -> (n x m x m)
    # token_weights masks padded (and non-candidate) positions
    probabilities = tf.nn.sigmoid(logits) * tf.to_float(token_weights)
    # (n x m x m), (n x m x m), (n x m x m) -> ()
    loss = tf.losses.sigmoid_cross_entropy(unlabeled_targets, logits, weights=token_weights)
    n_tokens = tf.to_float(tf.reduce_sum(token_weights))
    if self.linearize:
      lin_target_xent = lin_xent * unlabeled_targets
      loss -= tf.reduce_sum(lin_target_xent * tf.to_float(token_weights)) / (n_tokens + 1e-12)
    if self.distance:
      dist_target_kld = dist_kld * unlabeled_targets
      loss -= tf.reduce_sum(dist_target_kld * tf.to_float(token_weights)) / (n_tokens + 1e-12)

    #-----------------------------------------------------------
    # Compute predictions/accuracy (precision/recall counts)
    # (n x m x m) -> (n x m x m)
    predictions = nn.greater(logits, 0, dtype=tf.int32) * token_weights # predicted edges
    # if self.compare_precision:
    #   # (n x m x m) -> (n x m)
    #   temp_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    #   # (n x m) -> (n x m x m)
    #   cond = tf.equal(logits, tf.expand_dims(tf.reduce_max(logits, -1), -1))
    #   predictions = tf.where(cond, tf.cast(cond, tf.float32), tf.zeros_like(logits))
    #   predictions = tf.cast(predictions, tf.int32) * token_weights
    #   # (n x m) (*) (n x m) -> (n x m)
    #   # n_true_positives = tf.reduce_sum(nn.equal(tf.argmax(unlabeled_targets, axis=-1, output_type=tf.int32), temp_predictions) * self.token_weights)
    #   # n_predictions_temp = tf.reduce_sum(temp_predictions)
    #   # n_false_positives = n_predictions_temp - n_true_positives
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    true_positives = predictions * unlabeled_targets
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1, 2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1, 2])
    # (n) x 2 -> ()
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

  #-----------------------------------------------------------
  # Populate the output dictionary
  outputs['unlabeled_targets'] = unlabeled_targets
  outputs['probabilities'] = probabilities
  outputs['unlabeled_loss'] = loss
  outputs['loss'] = loss
  if debug:
    outputs['temp_targets'] = tf.argmax(unlabeled_targets, axis=-1, output_type=tf.int32)
    # outputs['temp_predictions'] = temp_predictions
  outputs['unlabeled_predictions'] = predictions
  outputs['n_unlabeled_true_positives'] = n_true_positives
  outputs['n_unlabeled_false_positives'] = n_false_positives
  outputs['n_unlabeled_false_negatives'] = n_false_negatives
  outputs['n_correct_unlabeled_sequences'] = n_correct_sequences
  outputs['predictions'] = predictions
  outputs['n_true_positives'] = n_true_positives
  outputs['n_false_positives'] = n_false_positives
  outputs['n_false_negatives'] = n_false_negatives
  outputs['n_correct_sequences'] = n_correct_sequences
  return outputs
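# Editor's note (a hypothetical usage sketch, not part of the original source): a caller is
# expected to fetch the scalar entries of this dictionary in a session and aggregate them
# across batches, roughly as follows (names of the session/feed are assumptions):
#
#   fetches = {k: outputs[k] for k in ('loss', 'n_true_positives',
#                                      'n_false_positives', 'n_false_negatives')}
#   results = sess.run(fetches, feed_dict=feed_dict)
#   precision = results['n_true_positives'] / max(
#       results['n_true_positives'] + results['n_false_positives'], 1)
#   recall = results['n_true_positives'] / max(
#       results['n_true_positives'] + results['n_false_negatives'], 1)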