def __init__(self, name, dep_reprs, head_reprs, roll_direction=0):
    """Build the arc-scoring graph and loss for a dependency-parsing head.

    Reads `config`, `inputs`, `n_classes` and `labels` from the enclosing
    scope rather than from arguments.

    Args:
        name: Stored on `self.name`; `name + '/predictions'` is used as the
            variable scope for everything created here.
        dep_reprs: Input representations projected to form the dependent
            side of the bilinear scorer.
        head_reprs: Input representations projected to form the head side
            of the bilinear scorer.
        roll_direction: Forwarded unchanged to
            `model_helpers.masked_ce_loss`.

    Sets:
        self.logits: Masked arc scores (root scores followed by word
            scores along the last axis).
        self.loss: Result of `model_helpers.masked_ce_loss` on the logits.
    """
    self.name = name
    # NOTE(review): scope nesting was reconstructed from flattened source;
    # confirm that the loss is meant to be built inside this variable scope.
    with tf.variable_scope(name + '/predictions'):
        # apply hidden layers to the input representations
        arc_dep_hidden = model_helpers.project(
            dep_reprs, config.projection_size, 'arc_dep_hidden')
        arc_head_hidden = model_helpers.project(
            head_reprs, config.projection_size, 'arc_head_hidden')
        arc_dep_hidden = tf.nn.relu(arc_dep_hidden)
        arc_head_hidden = tf.nn.relu(arc_head_hidden)
        arc_head_hidden = tf.nn.dropout(arc_head_hidden, inputs.keep_prob)
        arc_dep_hidden = tf.nn.dropout(arc_dep_hidden, inputs.keep_prob)

        # bilinear classifier excluding the final dot product
        arc_head = tf.layers.dense(
            arc_head_hidden, config.depparse_projection_size,
            name='arc_head')
        W = tf.get_variable(
            'shared_W',
            shape=[config.projection_size, n_classes,
                   config.depparse_projection_size])
        Wr = tf.get_variable(
            'relation_specific_W',
            shape=[config.projection_size,
                   config.depparse_projection_size])
        # Replicate Wr across the class axis so it can be added to W.
        Wr_proj = tf.tile(tf.expand_dims(Wr, axis=-2), [1, n_classes, 1])
        # Use an explicit add producing a new tensor instead of `W += ...`:
        # augmented assignment on a tf.Variable is deprecated for ref
        # variables and raises RuntimeError for resource variables.
        W_combined = W + Wr_proj
        arc_dep = tf.tensordot(arc_dep_hidden, W_combined, axes=[[-1], [0]])
        shape = tf.shape(arc_dep)
        # Fold the class axis into the preceding axis, keeping the leading
        # (presumably batch -- TODO confirm) dimension.
        arc_dep = tf.reshape(
            arc_dep, [shape[0], -1, config.depparse_projection_size])

        # apply the transformer scaling trick to prevent dot products from
        # getting too large (possibly not necessary)
        # Both operands are divided by size ** 0.25, so at initialization
        # their product is divided by sqrt(size). The scale is a variable.
        scale = np.power(
            config.depparse_projection_size, 0.25).astype('float32')
        scale = tf.get_variable('scale', initializer=scale, dtype=tf.float32)
        arc_dep /= scale
        arc_head /= scale

        # compute the scores for each candidate arc
        word_scores = tf.matmul(arc_head, arc_dep, transpose_b=True)
        root_scores = tf.layers.dense(arc_head, n_classes, name='root_score')
        arc_scores = tf.concat([root_scores, word_scores], axis=-1)

        # disallow the model from making impossible predictions
        # assumes inputs.mask is 2-D (batch, positions) -- TODO confirm
        mask = inputs.mask
        mask_shape = tf.shape(mask)
        # Repeat each mask entry once per class to line up with word_scores.
        mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, n_classes])
        mask = tf.reshape(mask, [-1, mask_shape[1] * n_classes])
        # Prepend the columns matching root_scores: only the first of the
        # n_classes root columns is left unmasked.
        mask = tf.concat([tf.ones((mask_shape[0], 1)),
                          tf.zeros((mask_shape[0], n_classes - 1)),
                          mask], axis=1)
        mask = tf.tile(tf.expand_dims(mask, 1), [1, mask_shape[1], 1])
        # Entries where mask == 0 have 100 subtracted from their score.
        arc_scores += (mask - 1) * 100.0

        self.logits = arc_scores
        self.loss = model_helpers.masked_ce_loss(
            self.logits, labels, inputs.mask,
            roll_direction=roll_direction)
def __init__(self, name, input_reprs, roll_direction=0, activate=True):
    """Build logits and a label-smoothed loss for a per-position classifier.

    Reads `config`, `inputs`, `n_classes` and `labels` from the enclosing
    scope rather than from arguments.

    Args:
        name: Stored on `self.name`; `name + '/predictions'` is the variable
            scope for the projection and output layers.
        input_reprs: Representations fed to `model_helpers.project`.
        roll_direction: Forwarded unchanged to
            `model_helpers.masked_ce_loss`.
        activate: When truthy, a ReLU is applied to the projection before
            the output layer.

    Sets:
        self.logits: Output of the dense `'predict'` layer with `n_classes`
            units.
        self.loss: Result of `model_helpers.masked_ce_loss` on the logits
            and the smoothed targets.
    """
    self.name = name
    with tf.variable_scope(name + '/predictions'):
        hidden = model_helpers.project(input_reprs, config.projection_size)
        hidden = tf.nn.relu(hidden) if activate else hidden
        self.logits = tf.layers.dense(hidden, n_classes, name='predict')

    # NOTE(review): scope nesting was reconstructed from flattened source;
    # confirm that the loss is meant to be built outside the variable scope.
    # Label smoothing: targets = labels * (1 - eps) + eps / n_classes,
    # where eps is inputs.label_smoothing.
    smoothed_targets = labels
    smoothed_targets *= (1 - inputs.label_smoothing)
    smoothed_targets += inputs.label_smoothing / n_classes
    self.loss = model_helpers.masked_ce_loss(
        self.logits,
        smoothed_targets,
        inputs.mask,
        roll_direction=roll_direction,
    )