Exemple #1
0
def encoder(source, params):
    mask = dtype.tf_to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size**-0.5)

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size],
                              initializer=initializer)
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source) * (hidden_size**0.5)
    inputs = tf.nn.bias_add(inputs, src_bias)
    inputs = func.add_timing_signal(inputs)

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs
        for layer in range(params.num_encoder_layer):
            if params.deep_transformer_init:
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1)**-0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("self_attention"):
                    y = func.dot_attention(x,
                                           None,
                                           func.attention_bias(
                                               mask, "masking"),
                                           hidden_size,
                                           num_heads=params.num_heads,
                                           dropout=params.attention_dropout)

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

    source_encodes = x
    x_shp = util.shape_list(x)

    return {
        "encodes": source_encodes,
        "decoder_initializer": {
            "layer_{}".format(l): {
                # plan aan
                "aan": dtype.tf_to_float(tf.zeros([x_shp[0], 1, hidden_size])),
            }
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
Exemple #2
0
def decoder(target, state, params):
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size**-0.5)

    is_training = ('decoder' not in state)

    if is_training:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size],
                              initializer=initializer)
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target) * (hidden_size**0.5)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if is_training:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
        inputs = func.add_timing_signal(inputs)
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs), lambda: inputs)
        mask = tf.ones_like(mask)
        inputs = func.add_timing_signal(inputs,
                                        time=dtype.tf_to_float(state['time']))

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            if params.deep_transformer_init:
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1)**-0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("average_attention"):
                    x_fwds = []
                    for strategy in params.strategies:
                        with tf.variable_scope(strategy):
                            x_fwd = average_attention_strategy(
                                strategy, x, mask, state, layer, params)
                            x_fwds.append(x_fwd)
                    x_fwd = tf.add_n(x_fwds) / len(x_fwds)

                    # FFN activation
                    if params.use_ffn:
                        y = func.ffn_layer(
                            x_fwd,
                            params.filter_size,
                            hidden_size,
                            dropout=params.relu_dropout,
                        )
                    else:
                        y = x_fwd

                    # Gating layer
                    z = func.linear(tf.concat([x, y], axis=-1),
                                    hidden_size * 2,
                                    scope="z_project")
                    i, f = tf.split(z, 2, axis=-1)
                    y = tf.sigmoid(i) * x + tf.sigmoid(f) * y

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("cross_attention"):
                    y = func.dot_attention(
                        x,
                        state['encodes'],
                        func.attention_bias(state['mask'], "masking"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout,
                        cache=None if is_training else
                        state['decoder']['state']['layer_{}'.format(layer)])
                    if not is_training:
                        # mk, mv
                        state['decoder']['state']['layer_{}'.format(layer)]\
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)
    feature = x
    if 'dev_decode' in state:
        feature = x[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size],
                                  initializer=initializer)
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(target,
                                               util.shape_list(logits)[-1],
                                               factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                          labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    mask = tf.cast(mask, tf.float32)
    per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(
        mask, -1)
    loss = tf.reduce_mean(per_sample_loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32), lambda: loss)

    return loss, logits, state, per_sample_loss
Exemple #3
0
def encoder(source, params):
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs

        for layer in range(params.num_encoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                # forward rnn
                with tf.variable_scope('forward'):
                    outputs = rnn.rnn(params.cell,
                                      x,
                                      hidden_size,
                                      mask=mask,
                                      ln=params.layer_norm,
                                      sm=params.swap_memory,
                                      dp=params.dropout)
                    output_fw, state_fw = outputs[1]
                if layer == 0:
                    # backward rnn
                    with tf.variable_scope('backward'):
                        if not params.caencoder:
                            outputs = rnn.rnn(params.cell,
                                              tf.reverse(x, [1]),
                                              hidden_size,
                                              mask=tf.reverse(mask, [1]),
                                              ln=params.layer_norm,
                                              sm=params.swap_memory,
                                              dp=params.dropout)
                            output_bw, state_bw = outputs[1]
                        else:
                            outputs = rnn.cond_rnn(params.cell,
                                                   tf.reverse(x, [1]),
                                                   tf.reverse(output_fw, [1]),
                                                   hidden_size,
                                                   mask=tf.reverse(mask, [1]),
                                                   ln=params.layer_norm,
                                                   sm=params.swap_memory,
                                                   num_heads=params.num_heads,
                                                   one2one=True)
                            output_bw, state_bw = outputs[1]

                        output_bw = tf.reverse(output_bw, [1])

                    if not params.caencoder:
                        y = tf.concat([output_fw, output_bw], -1)
                        z = tf.concat([state_fw, state_bw], -1)
                    else:
                        y = output_bw
                        z = state_bw
                else:
                    y = output_fw
                    z = state_fw

                y = func.linear(y, hidden_size, ln=False, scope="ff")

                # short cut via residual connection
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(params.cell,
                                    hidden_size,
                                    ln=params.layer_norm)

    return {
        "encodes": x,
        "decoder_initializer": {
            "layer_{}".format(l):
            decoder_cell.get_init_state(x=z, scope="layer_{}".format(l))
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
def decoder(target, state, params):
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size**-0.5)

    is_training = ('decoder' not in state)

    if is_training:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size],
                              initializer=initializer)
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target) * (hidden_size**0.5)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if is_training:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
        inputs = func.add_timing_signal(inputs)
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs), lambda: inputs)
        mask = tf.ones_like(mask)
        inputs = func.add_timing_signal(inputs,
                                        time=dtype.tf_to_float(state['time']))

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    # Applying L0Drop
    # --------
    source_memory = state["encodes"]
    source_mask = state["mask"]

    # source_pruning: log alpha_i = x_i w^T
    source_pruning = func.linear(source_memory, 1, scope="source_pruning")

    if is_training:  # training
        source_memory, l0_mask = l0norm.var_train(
            (source_memory, source_pruning))
        l0_norm_loss = tf.squeeze(l0norm.l0_norm(source_pruning), -1)
        l0_norm_loss = tf.reduce_sum(l0_norm_loss * source_mask,
                                     -1) / tf.reduce_sum(source_mask, -1)
        l0_norm_loss = tf.reduce_mean(l0_norm_loss)
        l0_norm_loss = l0norm.l0_regularization_loss(
            l0_norm_loss,
            reg_scalar=params.l0_norm_reg_scalar,
            start_reg_ramp_up=params.l0_norm_start_reg_ramp_up,
            end_reg_ramp_up=params.l0_norm_end_reg_ramp_up,
            warm_up=params.l0_norm_warm_up,
        )

        # force the model to only attend to unmasked position
        source_mask = dtype.tf_to_float(
            tf.cast(tf.squeeze(l0_mask, -1), tf.bool)) * source_mask
    else:  # evaluation
        source_memory, l0_mask = l0norm.var_eval(
            (source_memory, source_pruning))
        l0_norm_loss = 0.0

        source_memory, source_mask, count_mask = extract_encodes(
            source_memory, source_mask, l0_mask)
        count_mask = tf.expand_dims(tf.expand_dims(count_mask, 1), 1)
    # --------

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            if params.deep_transformer_init:
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1)**-0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("self_attention"):
                    y = func.dot_attention(
                        x,
                        None,
                        func.attention_bias(tf.shape(mask)[1], "causal"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout,
                        cache=None if is_training else
                        state['decoder']['state']['layer_{}'.format(layer)])
                    if not is_training:
                        # k, v
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("cross_attention"):
                    if is_training:
                        y = func.dot_attention(
                            x,
                            source_memory,
                            func.attention_bias(source_mask, "masking"),
                            hidden_size,
                            num_heads=params.num_heads,
                            dropout=params.attention_dropout,
                        )
                    else:
                        y = dot_attention(x,
                                          source_memory,
                                          func.attention_bias(
                                              source_mask, "masking"),
                                          hidden_size,
                                          count_mask=count_mask,
                                          num_heads=params.num_heads,
                                          dropout=params.attention_dropout,
                                          cache=state['decoder']['state'][
                                              'layer_{}'.format(layer)])

                        # mk, mv
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)
    feature = x
    if 'dev_decode' in state:
        feature = x[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size],
                                  initializer=initializer)
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(target,
                                               util.shape_list(logits)[-1],
                                               factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                          labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    mask = tf.cast(mask, tf.float32)
    per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(
        mask, -1)
    loss = tf.reduce_mean(per_sample_loss)

    loss = loss + l0_norm_loss

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, tf.float32), lambda: loss)

    return loss, logits, state, per_sample_loss
Exemple #5
0
def decoder(target, state, params):
    mask = tf.to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size

    if 'decoder' not in state:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size])
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if 'decoder' not in state:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs), lambda: inputs)
        mask = tf.ones_like(mask)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                init_state = state["decoder_initializer"]["layer_{}".format(
                    layer)]
                if 'decoder' in state:
                    init_state = state["decoder"]["state"]["layer_{}".format(
                        layer)]
                if layer == 0 or params.use_deep_att:
                    returns = rnn.cond_rnn(params.cell,
                                           x,
                                           state["encodes"],
                                           hidden_size,
                                           init_state=init_state,
                                           mask=mask,
                                           num_heads=params.num_heads,
                                           mem_mask=state["mask"],
                                           ln=params.layer_norm,
                                           sm=params.swap_memory,
                                           one2one=False,
                                           dp=params.dropout)
                    (_, hidden_state), (outputs,
                                        _), contexts, attentions = returns
                    c = contexts
                else:
                    if params.caencoder:
                        returns = rnn.cond_rnn(params.cell,
                                               x,
                                               c,
                                               hidden_size,
                                               init_state=init_state,
                                               mask=mask,
                                               mem_mask=mask,
                                               ln=params.layer_norm,
                                               sm=params.swap_memory,
                                               num_heads=params.num_heads,
                                               one2one=True,
                                               dp=params.dropout)
                        (_, hidden_state), (outputs,
                                            _), contexts, attentions = returns
                    else:
                        outputs = rnn.rnn(params.cell,
                                          tf.concat([x, c], -1),
                                          hidden_size,
                                          mask=mask,
                                          init_state=init_state,
                                          ln=params.layer_norm,
                                          sm=params.swap_memory,
                                          dp=params.dropout)
                        outputs, hidden_state = outputs[1]
                if 'decoder' in state:
                    state['decoder']['state']['layer_{}'.format(
                        layer)] = hidden_state

                y = func.linear(outputs, hidden_size, ln=False, scope="ff")

                # short cut via residual connection
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    feature = func.linear(tf.concat([x, c], -1),
                          params.embed_size,
                          ln=params.layer_norm,
                          scope="ff")
    feature = tf.nn.tanh(feature)

    if util.valid_dropout(params.dropout):
        feature = tf.nn.dropout(feature, 1. - params.dropout)

    if 'dev_decode' in state:
        feature = x[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size])
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    soft_label, normalizer = util.label_smooth(target,
                                               util.shape_list(logits)[-1],
                                               factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                          labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32), lambda: loss)

    return loss, logits, state