Example #1
def load_google_bert(
        base_location: str = './google_bert/downloads/multilingual_L-12_H-768_A-12/',
        use_attn_mask: bool = True,
        max_len: int = 512) -> Tuple[keras.Model, list, list]:
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    assert bert_config.max_position_embeddings >= max_len, 'Maximal sequence length exceeds the length of the pretrained position embeddings.'
    init_checkpoint = base_location + 'bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    model, transformer_out_layers, attention_layers = create_transformer(
        embedding_layer_norm=True,
        neg_inf=-10000.0,
        use_attn_mask=use_attn_mask,
        vocab_size=bert_config.vocab_size,
        accurate_gelu=True,
        layer_norm_epsilon=1e-12,
        max_len=max_len,
        max_position_embeddings=bert_config.max_position_embeddings,
        use_one_embedding_dropout=True,
        d_hid=bert_config.intermediate_size,
        embedding_dim=bert_config.hidden_size,
        num_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        residual_dropout=bert_config.hidden_dropout_prob,
        attention_dropout=bert_config.attention_probs_dropout_prob)
    weights = get_bert_weights_for_keras_model(check_point, model, var_names)
    model.set_weights(weights)
    return model, transformer_out_layers, attention_layers
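A minimal usage sketch for this loader, assuming the default checkpoint directory shown above is present locally; the max_len value is an arbitrary illustration, and the tuple is unpacked as in the function's return statement:

# Hypothetical usage; the checkpoint path and max_len are placeholders.
model, transformer_out_layers, attention_layers = load_google_bert(
    base_location='./google_bert/downloads/multilingual_L-12_H-768_A-12/',
    use_attn_mask=True,
    max_len=128)
model.summary()  # inspect the assembled Keras graph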
Example #2
 def create_small_model(self, use_attn_mask: bool):
     return create_transformer(vocab_size=self.vocab_size,
                               num_heads=self.num_heads,
                               num_layers=self.num_layers,
                               embedding_dim=self.embedding_dim,
                               d_hid=self.d_hid,
                               max_len=self.max_len,
                               use_attn_mask=use_attn_mask)
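For context, this snippet is a method of an enclosing class (note the self parameter and the hyperparameter attributes it reads). A minimal sketch of such a wrapper class, where the class name and every hyperparameter value are invented here purely for illustration:

# Hypothetical enclosing class; all hyperparameter values below are arbitrary
# illustrations, not values from the original project.
class SmallTransformerFactory:
    def __init__(self):
        self.vocab_size = 100
        self.num_heads = 2
        self.num_layers = 2
        self.embedding_dim = 16
        self.d_hid = 64
        self.max_len = 32

    def create_small_model(self, use_attn_mask: bool):
        return create_transformer(vocab_size=self.vocab_size,
                                  num_heads=self.num_heads,
                                  num_layers=self.num_layers,
                                  embedding_dim=self.embedding_dim,
                                  d_hid=self.d_hid,
                                  max_len=self.max_len,
                                  use_attn_mask=use_attn_mask)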
Example #3
def load_openai_transformer(path: str = './openai/model/', use_attn_mask: bool = True,
                            use_one_embedding_dropout: bool = False, max_len: int = 512) -> keras.Model:
    with open(path + 'params_shapes.json') as f:
        shapes = json.load(f)
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
    init_params[0] = init_params[0][:min(512, max_len)]
    # add special token embedding to token embedding
    init_params[1] = np.concatenate(
        (init_params[1], np.random.randn(TextEncoder.SPECIAL_COUNT, 768).astype(np.float32) * 0.02), axis=0)
    init_params = [np.zeros((TextEncoder.NUM_SEGMENTS, 768)).astype(np.float32)] + init_params  # segment embedding
    model = create_transformer(embedding_dim=768, embedding_dropout=0.1, vocab_size=40478,
                               max_len=min(512, max_len), use_attn_mask=use_attn_mask, trainable_pos_embedding=True,
                               num_heads=12, num_layers=12, use_one_embedding_dropout=use_one_embedding_dropout,
                               d_hid=4 * 768, attention_dropout=0.1, residual_dropout=0.1)
    model.set_weights(init_params)
    return model
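A short usage sketch, assuming the default './openai/model/' directory contains params_shapes.json and the ten params_{n}.npy shards read above; the max_len value is an arbitrary illustration:

# Hypothetical usage; the path is the function's default and must hold the
# pretrained OpenAI transformer weight files.
openai_model = load_openai_transformer(path='./openai/model/',
                                       use_attn_mask=True,
                                       use_one_embedding_dropout=False,
                                       max_len=256)
openai_model.summary()  # 12 layers, 12 heads, 768-dim embeddings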
Example #4
def load_google_bert(
        base_location: str = './google/downloads/multilingual_L-12_H-768_A-12/',
        use_attn_mask: bool = True,
        max_len: int = 512,
        verbose: bool = False) -> keras.Model:
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    init_checkpoint = base_location + 'bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    vocab_size = bert_config.vocab_size - TextEncoder.BERT_SPECIAL_COUNT - TextEncoder.BERT_UNUSED_COUNT
    model = create_transformer(
        embedding_layer_norm=True,
        neg_inf=-10000.0,
        use_attn_mask=use_attn_mask,
        vocab_size=vocab_size,
        accurate_gelu=True,
        ln_epsilon=1e-12,
        max_len=max_len,
        use_one_embedding_dropout=True,
        d_hid=bert_config.intermediate_size,
        embedding_dim=bert_config.hidden_size,
        num_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        residual_dropout=bert_config.hidden_dropout_prob,
        attention_dropout=bert_config.attention_probs_dropout_prob)
    if K.backend() == 'tensorflow':
        weights = [np.zeros(w.shape) for w in model.weights]
    else:
        weights = [np.zeros(w.get_value().shape) for w in model.weights]
    for var_name, _ in var_names:
        w_id = None
        qkv = None
        unsqueeze = False
        parts = var_name.split('/')
        first_vars_size = 5
        if parts[1] == 'embeddings':
            n = parts[-1]
            if n == 'token_type_embeddings':
                w_id = 0
            elif n == 'position_embeddings':
                w_id = 1
            elif n == 'word_embeddings':
                w_id = 2
            elif n == 'gamma':
                w_id = 3
            elif n == 'beta':
                w_id = 4
            else:
                raise ValueError()
        elif parts[2].startswith('layer_'):
            layer_number = int(parts[2][len('layer_'):])
            if parts[3] == 'attention':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 5
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 4
                elif parts[-2] == 'dense':
                    if parts[-1] == 'bias':
                        w_id = first_vars_size + layer_number * 12 + 3
                    elif parts[-1] == 'kernel':
                        w_id = first_vars_size + layer_number * 12 + 2
                        unsqueeze = True
                    else:
                        raise ValueError()
                elif parts[-2] in ('key', 'query', 'value'):
                    w_id = first_vars_size + layer_number * 12 + (
                        0 if parts[-1] == 'kernel' else 1)
                    unsqueeze = parts[-1] == 'kernel'
                    qkv = parts[-2][0]
                else:
                    raise ValueError()
            elif parts[3] == 'intermediate':
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 7
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 6
                    unsqueeze = True
                else:
                    raise ValueError()
            elif parts[3] == 'output':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 11
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 10
                elif parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 9
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 8
                    unsqueeze = True
                else:
                    raise ValueError()

        if w_id is not None and qkv is None:
            if verbose:
                print(var_name, ' -> ', model.weights[w_id].name)
            if w_id == 1:  # pos embedding
                saved = check_point.get_tensor(var_name)
                weights[w_id][:max_len, :] = (saved[:max_len, :] if not unsqueeze
                                              else saved[None, :max_len, :])
            elif w_id == 2:  # word embedding
                # ours: unk, [vocab], pad, msk(mask), bos(cls), del(use sep again), eos(sep)
                # theirs: pad, 99 unused, unk, cls, sep, mask, [vocab]
                saved = check_point.get_tensor(var_name)  # shape: (vocab_size, emb_size)
                weights[w_id][0] = saved[1 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][1:vocab_size] = saved[-vocab_size + 1:]
                weights[w_id][vocab_size + TextEncoder.PAD_OFFSET] = saved[0]
                weights[w_id][vocab_size + TextEncoder.MSK_OFFSET] = saved[4 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.BOS_OFFSET] = saved[2 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.DEL_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.EOS_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
            else:
                saved = check_point.get_tensor(var_name)
                weights[w_id][:] = saved if not unsqueeze else saved[None, ...]
        elif w_id is not None:
            if verbose:
                print(var_name, ' -> ', model.weights[w_id].name, '::', qkv)
            p = {'q': 0, 'k': 1, 'v': 2}[qkv]
            saved = check_point.get_tensor(var_name)
            if weights[w_id].ndim == 3:
                dim_size = weights[w_id].shape[1]
                weights[w_id][0, :, p * dim_size:(p + 1) * dim_size] = (
                    saved if not unsqueeze else saved[None, ...])
            else:
                dim_size = weights[w_id].shape[0] // 3
                weights[w_id][p * dim_size:(p + 1) * dim_size] = saved
        else:
            if verbose:
                # TODO: pooler, cls/predictions, cls/seq_relationship are not mapped
                print('not mapped: ', var_name)
    model.set_weights(weights)
    return model
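The w_id arithmetic in Example #4 packs the weights as 5 embedding tensors followed by 12 tensors per transformer block. A small helper restating that layout, derived directly from the branches above (the slot labels are descriptive names chosen here, not identifiers from the checkpoint or the Keras model):

# Restatement of the flat weight-index layout used above: 5 embedding weights,
# then 12 weights per transformer layer, in this order (labels are descriptive only).
EMBEDDING_SLOTS = ['token_type_embeddings', 'position_embeddings',
                   'word_embeddings', 'embedding_ln_gamma', 'embedding_ln_beta']
LAYER_SLOTS = ['qkv_kernel', 'qkv_bias',                          # 0, 1 (q/k/v packed side by side)
               'attention_dense_kernel', 'attention_dense_bias',  # 2, 3
               'attention_ln_gamma', 'attention_ln_beta',         # 4, 5
               'intermediate_kernel', 'intermediate_bias',        # 6, 7
               'output_kernel', 'output_bias',                    # 8, 9
               'output_ln_gamma', 'output_ln_beta']               # 10, 11

def describe_w_id(w_id: int) -> str:
    """Map a flat weight index back to a human-readable slot name."""
    if w_id < len(EMBEDDING_SLOTS):
        return EMBEDDING_SLOTS[w_id]
    layer, slot = divmod(w_id - len(EMBEDDING_SLOTS), len(LAYER_SLOTS))
    return 'layer_{}/{}'.format(layer, LAYER_SLOTS[slot])

print(describe_w_id(5))   # -> 'layer_0/qkv_kernel'
print(describe_w_id(16))  # -> 'layer_0/output_ln_beta'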