Example #1
    def build(self, input_shape):
        if not isinstance(input_shape, list) or len(input_shape) != 2:
            raise ValueError(
                'A `SelfMultiHeadAttention` layer should be called '
                'on a list of 2 tensors')
        if len(input_shape[0]) != 3 or len(input_shape[1]) != 2:
            raise ValueError(
                'input: [N, T_k, d_model], key masks: [N, key_seqlen]')
        embedding_size = int(input_shape[0][-1])
        if self.num_units is None:
            self.num_units = embedding_size
        self.W = self.add_weight(name='Q_K_V',
                                 shape=[embedding_size, self.num_units * 3],
                                 dtype=tf.float32,
                                 initializer=TruncatedNormal(seed=self.seed))
        self.W_output = self.add_weight(
            name='output_W',
            shape=[self.num_units, self.num_units],
            dtype=tf.float32,
            initializer=TruncatedNormal(seed=self.seed))

        self.layer_norm = LayerNormalization()
        self.attention = DotAttention(scale=self.scale)
        self.softmax_weight_sum = SoftmaxWeightedSum(
            dropout_rate=self.dropout_rate,
            future_binding=self.future_binding,
            seed=self.seed)
        self.dropout = Dropout(self.dropout_rate, seed=self.seed)
        self.seq_len_max = int(input_shape[0][1])
        # Be sure to call this somewhere!
        super(SelfMultiHeadAttention, self).build(input_shape)
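The single Q_K_V weight above packs the query, key and value projections into one matrix. As a minimal sketch (assuming the layer applies self.W to its first input at call time; the variable names below are illustrative and not taken from the example), the packed projection is typically split like this:

# Sketch only: inputs has shape [N, T, d_model], self.W has shape [d_model, num_units * 3]
qkv = tf.tensordot(inputs, self.W, axes=(-1, 0))    # [N, T, num_units * 3]
queries, keys, values = tf.split(qkv, 3, axis=-1)   # each [N, T, num_units]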
Example #2
    def build(self, input_shape):
        # Create a trainable weight variable for this layer.

        if self.sess_max_count == 1:
            embed_size = input_shape[2].value
            seq_len_max = input_shape[1].value
        else:
            embed_size = input_shape[0][2].value
            seq_len_max = input_shape[0][1].value

        self.sess_bias_embedding = self.add_weight(
            'sess_bias_embedding',
            shape=(self.sess_max_count, 1, 1),
            initializer=TruncatedNormal(mean=0.0,
                                        stddev=0.0001,
                                        seed=self.seed))
        self.seq_bias_embedding = self.add_weight('seq_bias_embedding',
                                                  shape=(1, seq_len_max, 1),
                                                  initializer=TruncatedNormal(
                                                      mean=0.0,
                                                      stddev=0.0001,
                                                      seed=self.seed))
        self.item_bias_embedding = self.add_weight('item_bias_embedding',
                                                   shape=(1, 1, embed_size),
                                                   initializer=TruncatedNormal(
                                                       mean=0.0,
                                                       stddev=0.0001,
                                                       seed=self.seed))

        # Be sure to call this somewhere!
        super(BiasEncoding, self).build(input_shape)
Example #3
def get_embedding(region_num, region_feature_dim_dict, base_feature_dim_dict,
                  bias_feature_dim_dict, init_std, seed, l2_reg_linear):
    region_embeddings = [[
        Embedding(feat.dimension,
                  1,
                  embeddings_initializer=TruncatedNormal(stddev=init_std,
                                                         seed=seed + j),
                  embeddings_regularizer=l2(l2_reg_linear),
                  name='region_emb_' + str(j) + '_' + str(i))
        for i, feat in enumerate(region_feature_dim_dict['sparse'])
    ] for j in range(region_num)]
    base_embeddings = [[
        Embedding(feat.dimension,
                  1,
                  embeddings_initializer=TruncatedNormal(stddev=init_std,
                                                         seed=seed + j),
                  embeddings_regularizer=l2(l2_reg_linear),
                  name='base_emb_' + str(j) + '_' + str(i))
        for i, feat in enumerate(base_feature_dim_dict['sparse'])
    ] for j in range(region_num)]
    bias_embedding = [
        Embedding(feat.dimension,
                  1,
                  embeddings_initializer=TruncatedNormal(stddev=init_std,
                                                         seed=seed),
                  embeddings_regularizer=l2(l2_reg_linear),
                  name='embed_bias' + '_' + str(i))
        for i, feat in enumerate(bias_feature_dim_dict['sparse'])
    ]

    return region_embeddings, base_embeddings, bias_embedding
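A minimal usage sketch for get_embedding (the SparseFeat namedtuple and the dict layout below are assumptions made for illustration, not part of the example):

from collections import namedtuple

# Hypothetical feature spec: each sparse feature only needs a .dimension (vocabulary size).
SparseFeat = namedtuple('SparseFeat', ['name', 'dimension'])
feature_dims = {'sparse': [SparseFeat('user_id', 1000), SparseFeat('item_id', 5000)]}

region_emb, base_emb, bias_emb = get_embedding(
    region_num=4,
    region_feature_dim_dict=feature_dims,
    base_feature_dim_dict=feature_dims,
    bias_feature_dim_dict=feature_dims,
    init_std=0.0001, seed=1024, l2_reg_linear=1e-5)
# region_emb and base_emb are lists of region_num inner lists of Embedding layers,
# one inner list per region and one Embedding per sparse feature.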
Example #4
    def build(self, input_shape):
        embedding_size = int(input_shape[0][-1])
        # W_q, W_k and W_v are packed together into a single weight matrix
        self.W = self.add_weight(name='Q_K_V',
                                 shape=[embedding_size, self.num_units * 3],
                                 dtype=tf.float32,
                                 initializer=TruncatedNormal(seed=self.seed))
        self.W_output = self.add_weight(name='output_W',
                                        shape=[self.num_units, self.num_units],
                                        dtype=tf.float32,
                                        initializer=TruncatedNormal(seed=self.seed))
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(self.dropout_rate, seed=self.seed)
        self.seq_len_max = int(input_shape[0][1])
        super(MultiHeadAttention, self).build(input_shape)
Example #5
    def __init__(
            self,
            kind: str,
            n_units,
            n_layers=1,
            # It's not obvious how to compute fan_in/fan_out for these models,
            # so we recommend avoiding glorot initialization for now
            w_init=TruncatedNormal(stddev=0.05),
            recurrent_init=None,
            bidirectional=True,
            learn_initial_states: bool = False,
            lstm_bias=1,
            keep_recurrent: float = 1):
        if bidirectional is None or n_layers is None or n_units is None:
            raise ValueError()
        if kind not in ["GRU", "LSTM"]:
            raise ValueError()
        self._kind = kind
        self.keep_recurrent = keep_recurrent
        self.lstm_bias = lstm_bias
        self.n_units = n_units
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.w_init = w_init
        self.recurrent_init = recurrent_init
        self.learn_initial_states = learn_initial_states
Example #6
def get_initializer(initializer_params):
    if initializer_params["function"] == "truncated_normal":
        return TruncatedNormal(stddev=initializer_params["stddev"])
    elif initializer_params["function"] == "constant":
        return Constant(value=initializer_params["value"])
    else:
        return initializer_params["function"]
Example #7
def mvm(embeddings, factor_size):
    num_features = int(embeddings.shape.dims[1])
    bias = tf.get_variable("padding_bias", (num_features, factor_size), initializer=TruncatedNormal(stddev=0.02))
    all_order = tf.add(embeddings, bias)
    out = all_order[:, 0, :]  # [B, factor_size]
    for i in range(1, num_features):
        out = tf.multiply(out, all_order[:, i, :])
    out = tf.reshape(out, shape=[-1, factor_size])
    return out
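mvm adds a learned bias to every feature embedding and then multiplies the bias-shifted embeddings element-wise across the feature axis, yielding one factor_size vector per example. A minimal graph-mode sketch (TF1-style, since the function relies on tf.get_variable; the shapes are illustrative):

import tensorflow as tf

# embeddings: [batch, num_features, factor_size]
embeddings = tf.placeholder(tf.float32, shape=[None, 8, 16])
with tf.variable_scope("mvm_example"):
    out = mvm(embeddings, factor_size=16)   # -> [batch, 16]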
Example #8
    def __init__(self,
                 n_units,
                 n_layers=1,
                 lstm_bias=1,
                 w_init=TruncatedNormal(stddev=0.05),
                 recurrent_init=None,
                 bidirectional=True,
                 learn_initial_states=False):
        super().__init__("LSTM", n_units, n_layers, w_init, recurrent_init,
                         bidirectional, learn_initial_states, lstm_bias)
Example #9
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

img_size = 28
img_size_flat = 784
img_shape = [28, 28]
img_shape_full = [28, 28, 1]
n_classes = 10
num_channels = 1

n_layers = 2
n_neurones = []
n_neurones.extend([484] * n_layers)
n_neurones.append(n_classes)

ini = TruncatedNormal(mean=0.0, stddev=0.1, seed=None)
optimizer = Adam(lr=1e-3)

model = Sequential()
model.add(InputLayer(input_shape=(img_size_flat, )))

for i in range(n_layers - 1):
    model.add(
        Dense(n_neurones[i],
              kernel_initializer=ini,
              bias_initializer=ini,
              activation='relu'))

model.add(Reshape([22, 22, 1]))
model.add(
    Conv2D(kernel_size=5,
Example #10
def main():
    parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode", choices=["input", "output", "both", "none"], default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    args = parser.parse_args()

    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    params = trainer.TrainParams(trainer.SerializableOptimizer("Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999, max_checkpoints_to_keep=2, async_encoding=10,
                                 num_epochs=24, log_period=30, eval_period=1200, save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )

    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0, learn_unk=False, cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20, init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True
        ),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(FullyConnected(dim * 2, activation="relu"),
                                        ResidualLayer(SequenceMapperSeq(
                                            VariationalDropoutLayer(0.8),
                                            recurrent_layer,
                                            VariationalDropoutLayer(0.8),
                                            StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                                            FullyConnected(dim * 2, activation="relu"),
                                        )),
                                        VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ))
    )

    batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)

    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(), key=lambda x:x[0])) + "\n" + notes

    trainer.start_training(data, model, params,
                           [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")],
                           ModelDir(out), notes)