Example #1
    def __init__(self, backend="gnn"):
        super(Agent, self).__init__()
        self.d_model = 64
        heads = 4
        d_ff = self.d_model*2
        layers = 4

        if backend in {"transformer","alternate"}:
            self.encoder = TransformerEncoder(layers, heads, self.d_model,
                                              hidden_dimensionality=d_ff,
                                              alternate=(backend == "alternate"))
            backend = "transformer"
        elif backend == "torch_transformer":
            layer = TransformerEncoderLayer(self.d_model,
                                            heads,
                                            d_ff,
                                            dropout=0.0,
                                            activation="relu")
            self.encoder = TransformerEncoder(layer, layers, LayerNorm(self.d_model))
        elif backend == "gnn":
            self.encoder = BoundaryEncoder(layers=layers, H=self.d_model)
        elif backend == "random":
            self.encoder = None
        
        self.backend = backend

        number_of_actions = 3 # you can go to next vertex, subtract, or add
        if self.encoder is not None:
            self.predict = nn.Linear(self.d_model, number_of_actions)

        self.finalize()
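A minimal usage sketch for the constructor above (not from the source); the Agent class and its encoder backends are assumed to be importable from the surrounding module, and any backend outside the four handled branches would leave self.encoder unset.

# Hypothetical instantiations; names and values are illustrative only.
gnn_agent = Agent()                               # default "gnn" -> BoundaryEncoder
transformer_agent = Agent(backend="transformer")  # plain TransformerEncoder
alternating_agent = Agent(backend="alternate")    # stored afterwards as backend == "transformer"
random_agent = Agent(backend="random")            # no encoder and no prediction head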
Example #2
    def __init__(self, emb_src, emb_tgt):
        super(TransormerNet, self).__init__()
        self.embeddings_src = emb_src
        self.embeddings_tgt = emb_tgt

        self.passage_encoder = TransformerEncoder(num_layers=2,
                                                  d_model=300,
                                                  heads=10,
                                                  d_ff=2048,
                                                  dropout=0.1,
                                                  embeddings=emb_src)
        self.query_encoder = TransformerEncoder(num_layers=2,
                                                d_model=300,
                                                heads=10,
                                                d_ff=2048,
                                                dropout=0.1,
                                                embeddings=emb_src)
        # self.query_encoder = self.passage_encoder

        self._matrix_attention = LegacyMatrixAttention()

        self.combine = CnnEncoder(embedding_dim=1200, num_filters=100)

        # self.max_
        self.linear = nn.Linear(200, 1)
        self.sigmoid = nn.Sigmoid()
Example #3
    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)

        self.lstm = nn.LSTMCell(32 * 3 * 3, 256)
        # NOTE: layers_count, hidden_size, heads_count, d_ff and dropout_prob are
        # not arguments of this __init__; they are assumed to be defined in the
        # enclosing scope (e.g. as module-level hyperparameters).
        self.encoder = TransformerEncoder(layers_count=layers_count,
                                          d_model=hidden_size,
                                          heads_count=heads_count,
                                          d_ff=d_ff,
                                          dropout_prob=dropout_prob)
        num_outputs = action_space.n
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, num_outputs)

        self.apply(weights_init)
        self.actor_linear.weight.data = normalized_columns_initializer(
            self.actor_linear.weight.data, 0.01)
        self.actor_linear.bias.data.fill_(0)
        self.critic_linear.weight.data = normalized_columns_initializer(
            self.critic_linear.weight.data, 1.0)
        self.critic_linear.bias.data.fill_(0)

        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)

        self.train()
Example #4
    def __init__(self, args_dict):
        super().__init__()
        channels = args_dict['channels']
        self.num_layers = args_dict['num_layers']

        self.positional_encoder = PositionalEncoder(
            channels, args_dict['sequence_length'])

        encoder_layer = TransformerEncoderLayer(args_dict['channels'],
                                                args_dict['num_heads'],
                                                args_dict['feedforward_size'],
                                                args_dict['dropout'])
        encoder_norm = torch.nn.LayerNorm(args_dict['channels'])
        self.encoder = TransformerEncoder(encoder_layer,
                                          args_dict['num_layers'],
                                          encoder_norm)

        # self.attentions = nn.ModuleList()
        # self.dropout1 = nn.ModuleList()
        # self.norm1 = nn.ModuleList()
        # self.ff1 = nn.ModuleList()
        # self.ff2 = nn.ModuleList()
        # self.dropout2 = nn.ModuleList()
        # self.norm2 = nn.ModuleList()
        #
        # for _ in range(self.num_layers):
        #     self.attentions.append(MultiheadAttention(args_dict['num_heads'], channels))
        #     self.dropout1.append(nn.Dropout(args_dict['dropout']))
        #     self.norm1.append(nn.LayerNorm(channels))
        #     self.ff1.append(nn.Linear(channels, args_dict['feedforward_size']))
        #     self.ff2.append(nn.Linear(args_dict['feedforward_size'], channels))
        #     self.dropout2.append(nn.Dropout(args_dict['dropout']))
        #     self.norm2.append(nn.LayerNorm(channels))

        self.end_layer = nn.Linear(channels, args_dict['output_size'])
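The constructor above pulls every hyperparameter out of a single dictionary; the sketch below lists the expected keys, with placeholder values and a placeholder class name, since the enclosing class is not shown.

args_dict = {
    'channels': 128,          # model width fed to the encoder layers and end_layer
    'num_layers': 4,          # number of stacked TransformerEncoderLayer blocks
    'sequence_length': 256,   # length used by the PositionalEncoder
    'num_heads': 8,           # attention heads per layer
    'feedforward_size': 512,  # hidden size of the position-wise feed-forward block
    'dropout': 0.1,
    'output_size': 10,        # width of the final nn.Linear
}
model = SequenceModel(args_dict)  # "SequenceModel" is a placeholder for the unnamed class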
Example #5
def make_model(src_dim, N=6,
               d_model=512, d_ff=2048, h=8, dropout=0.1, batch_size=10, n_class=15):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = TransformerEncoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_dim), c(position)),
        batch_size,
        d_model,
        n_class
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
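A hedged usage sketch of the helper above; the values are placeholders, and the wrapped classes (Encoder, EncoderLayer, Embeddings, PositionalEncoding, ...) are assumed to be defined alongside it.

model = make_model(src_dim=1000,   # input vocabulary / feature dimension
                   N=2,            # number of encoder layers
                   d_model=256, d_ff=1024, h=8,
                   dropout=0.1, batch_size=32, n_class=15)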
Example #6
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerLM, self).__init__()
        try:
            from transformer import TransformerEncoder, TransformerEncoderLayer
        except ImportError:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.0.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()
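An illustrative instantiation of the language model above; the values are placeholders, and the rest of the class (including init_weights and the forward pass) is assumed to be defined elsewhere.

lm = TransformerLM(ntoken=10000,  # vocabulary size
                   ninp=200,      # embedding / model dimension
                   nhead=2,       # attention heads
                   nhid=200,      # feed-forward hidden size
                   nlayers=2,
                   dropout=0.2)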
Example #7
    def __init__(self):
        super().__init__()
        self.attn_enc = TransformerEncoder(39, 200, n_head=1, d_k=100, d_v=100)
        self.rnn_enc_1 = DynamicEncoder(78,
                                        200,
                                        n_layers=1,
                                        dropout=0.2,
                                        bidir=True)
        self.rnn_enc_2 = DynamicEncoder(200,
                                        200,
                                        n_layers=1,
                                        dropout=0.0,
                                        bidir=True)

        self.hir = 5
        self.out = nn.Linear(400, 20)
        self.hidden_size = 200
Example #8
def build_model(layers_count, hidden_size, heads_count, d_ff, dropout_prob,
                max_len, vocabulary_size):
    token_embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                   embedding_dim=hidden_size)
    positional_embedding = PositionalEmbedding(max_len=max_len,
                                               hidden_size=hidden_size)
    segment_embedding = SegmentEmbedding(hidden_size=hidden_size)

    encoder = TransformerEncoder(layers_count=layers_count,
                                 d_model=hidden_size,
                                 heads_count=heads_count,
                                 d_ff=d_ff,
                                 dropout_prob=dropout_prob)

    bert = BERT(encoder=encoder,
                token_embedding=token_embedding,
                positional_embedding=positional_embedding,
                segment_embedding=segment_embedding,
                hidden_size=hidden_size,
                vocabulary_size=vocabulary_size)

    return bert
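An illustrative call to build_model above; the hyperparameters are small BERT-style placeholders, not values from the source.

bert = build_model(layers_count=4,
                   hidden_size=256,
                   heads_count=4,
                   d_ff=1024,
                   dropout_prob=0.1,
                   max_len=128,
                   vocabulary_size=30000)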
Example #9
    def __init__(
        self,
        emb_sz,
        n_classes,
        d_model,
        nhead,
        num_encoder_layers,
        dim_feedforward,
        dropout,
        batch_first,
    ):
        super(TransformerEncoderModel, self).__init__()
        layer = TransformerEncoderLayer(d_model,
                                        nhead,
                                        dim_feedforward,
                                        dropout,
                                        batch_first=batch_first)
        self.transformer_encoder = TransformerEncoder(layer,
                                                      num_encoder_layers)
        self.src_embedding = Embeddings(emb_sz, d_model)
        self.pos = PositionalEncoding(d_model, dropout)
        self.linear = nn.Linear(d_model, n_classes)
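An illustrative instantiation of the classifier above; all values are placeholders chosen to match the constructor signature.

clf = TransformerEncoderModel(emb_sz=20000,          # source vocabulary size
                              n_classes=5,
                              d_model=256,
                              nhead=4,
                              num_encoder_layers=3,
                              dim_feedforward=512,
                              dropout=0.1,
                              batch_first=True)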
Example #10
def make_model(args, word_vocab_size, tag_vocab_size, num_labels):
    """Initiliaze a the BiAffine parser according to the specs in args."""
    # Embeddings
    if args.use_chars:
        if args.char_encoder == 'rnn':
            word_embedding = RecurrentCharEmbedding(word_vocab_size,
                                                    args.word_emb_dim,
                                                    padding_idx=PAD_INDEX)
        elif args.char_encoder == 'cnn':
            word_embedding = ConvolutionalCharEmbedding(
                word_vocab_size,
                padding_idx=PAD_INDEX,
                filter_factor=args.filter_factor)
            args.word_emb_dim = word_embedding.output_size  # CNN encoder is not so flexible
            print(
                'CNN character model produces word embeddings of dimension {}.'
                .format(args.word_emb_dim))
        elif args.char_encoder == 'transformer':
            raise NotImplementedError(
                'Transformer character encoder not yet implemented.')
    else:
        word_embedding = nn.Embedding(word_vocab_size,
                                      args.word_emb_dim,
                                      padding_idx=PAD_INDEX)
        if args.use_glove:
            raise NotImplementedError('GloVe embeddings not yet implemented.')
    # Words, tags, or both
    if args.disable_tags:
        embedding = WordEmbedding(word_embedding, args.emb_dropout)
        embedding_dim = args.word_emb_dim
    elif args.disable_words:  # Experimental reasons
        tag_embedding = nn.Embedding(tag_vocab_size,
                                     args.tag_emb_dim,
                                     padding_idx=PAD_INDEX)
        embedding = TagEmbedding(tag_embedding, args.emb_dropout)
        embedding_dim = args.tag_emb_dim
    else:
        tag_embedding = nn.Embedding(tag_vocab_size,
                                     args.tag_emb_dim,
                                     padding_idx=PAD_INDEX)
        embedding = WordTagEmbedding(word_embedding, tag_embedding,
                                     args.emb_dropout)
        embedding_dim = args.word_emb_dim + args.tag_emb_dim

    # Encoder
    if args.encoder == 'rnn':
        encoder = RecurrentEncoder(args.rnn_type,
                                   embedding_dim,
                                   args.rnn_hidden,
                                   args.rnn_num_layers,
                                   args.batch_first,
                                   args.rnn_dropout,
                                   bidirectional=True)
        encoder_dim = 2 * args.rnn_hidden
    elif args.encoder == 'cnn':
        encoder = ConvolutionalEncoder(embedding_dim,
                                       args.cnn_num_layers,
                                       args.kernel_size,
                                       dropout=args.cnn_dropout)
        encoder_dim = embedding_dim
    elif args.encoder == 'transformer':
        encoder = TransformerEncoder(embedding_dim,
                                     args.N,
                                     args.d_model,
                                     args.d_ff,
                                     args.h,
                                     dropout=args.trans_dropout)
        encoder_dim = args.d_model
    elif args.encoder == 'none':
        encoder = NoEncoder()
        encoder_dim = embedding_dim

    # Initialize the model.
    model = BiAffineParser(embedding, encoder, args.encoder, encoder_dim,
                           args.mlp_arc_hidden, args.mlp_lab_hidden,
                           args.mlp_dropout, num_labels, nn.CrossEntropyLoss)

    # Initialize parameters with Glorot.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model
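The parser factory above reads a large number of fields from args; the sketch below collects every field the function body touches into an argparse.Namespace, with placeholder values.

from argparse import Namespace

args = Namespace(
    # embeddings
    use_chars=False, char_encoder='rnn', filter_factor=25,
    word_emb_dim=100, tag_emb_dim=25, emb_dropout=0.33,
    use_glove=False, disable_tags=False, disable_words=False,
    # encoder selection ('rnn', 'cnn', 'transformer', or 'none')
    encoder='rnn', batch_first=True,
    rnn_type='LSTM', rnn_hidden=400, rnn_num_layers=3, rnn_dropout=0.33,
    cnn_num_layers=4, kernel_size=3, cnn_dropout=0.33,
    N=6, d_model=512, d_ff=2048, h=8, trans_dropout=0.1,
    # biaffine MLPs
    mlp_arc_hidden=500, mlp_lab_hidden=100, mlp_dropout=0.33,
)
model = make_model(args, word_vocab_size=20000, tag_vocab_size=50, num_labels=40)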
Example #11
    def __init__(self, input_size, hidden_size, output_size, num_layers=1,
                 num_heads=1, use_cuda=True, batch_size=50, dropout_input=0,
                 dropout_hidden=0.5, embedding_dim=-1, position_embedding=False,
                 shared_embedding=True, window_size=8, kernel_type='exp-1',
                 contextualize_opt=None):
        super().__init__()
        
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        
        self.embed = nn.Embedding(input_size, hidden_size, padding_idx=0).to(self.device)
        self.pe = PositionalEncoding(hidden_size, dropout_input,  max_len=window_size)

        if shared_embedding:
            self.out_matrix = self.embed.weight.to(self.device)
        else:
            self.out_matrix = nn.Parameter(torch.rand(output_size, hidden_size, requires_grad=True, device=self.device))

#         self.shared_key = torch.rand(1, batch_size, hidden_size, requires_grad=True).to(self.device)
                
        encoder_layer = TransformerEncoderLayer(hidden_size, num_heads, dim_feedforward=2048, dropout=dropout_hidden)
        norm = nn.LayerNorm(hidden_size)
        self.encoder = TransformerEncoder(encoder_layer, num_layers, norm=norm).to(self.device)
        
        self.decoder = MultiheadAttention(hidden_size, num_heads, dropout=dropout_hidden)
        
        parts = kernel_type.split('-')
        kernel_types = []

        self.params = []
        for i in range( len(parts) ):
            pi = parts[i]
            if pi in {'exp', 'exp*', 'log', 'lin', 'exp^', 'exp*^', 'log^', 'lin^', 'ind', 'const', 'thres'}:
                if pi.endswith('^'):
                    var = (nn.Parameter(torch.rand(1, requires_grad=True, device=self.device)*5+10 ), nn.Parameter(torch.rand(1, requires_grad=True, device=self.device)))
                    kernel_types.append( pi[:-1] )
                else:
                    var = (nn.Parameter(torch.rand(1, requires_grad=True, device=self.device)*0.01), nn.Parameter(torch.rand(1, requires_grad=True, device=self.device) )  )
                    kernel_types.append( pi )
                    
                self.register_parameter(pi+str(len(self.params))+'_0',  var[0] )
                self.register_parameter(pi+str(len(self.params))+'_1',  var[1] )
                
                self.params.append( var )
                
            elif pi.isdigit():
                val = int(pi)
                if val > 1:
                    pi = parts[i-1]
                    for j in range(val-1):
                        if pi.endswith('^'):
                            var = (nn.Parameter(torch.rand(1, requires_grad=True, device=self.device)*5+10 ), nn.Parameter(torch.rand(1, requires_grad=True, device=self.device)))
                            kernel_types.append( pi[:-1] )
                        else:
                            var = (nn.Parameter(torch.rand(1, requires_grad=True, device=self.device)*0.01), nn.Parameter(torch.rand(1, requires_grad=True, device=self.device) )  )
                            kernel_types.append( pi )
                            
                        
                        self.register_parameter(pi+str(len(self.params))+'_0',  var[0] )
                        self.register_parameter(pi+str(len(self.params))+'_1',  var[1] )
                        
                        self.params.append( var )

            else:
                print('no matching kernel '+ pi) 
                
        self.kernel_num = len(kernel_types)
        print(kernel_types, self.params)
            
        def decay_constructor(t):
            kernels = []
            for i in range( self.kernel_num ):
                pi = kernel_types[i]
                if pi == 'log':
                    kernels.append( torch.mul( self.params[i][0] , torch.log1p(t) ) + self.params[i][1] )
                elif pi == 'exp':
                    kernels.append(  1000* torch.exp( torch.mul( self.params[i][0], torch.neg( t ) ) ) + self.params[i][1] )
                elif pi == 'exp*':
                    kernels.append(  torch.mul( self.params[i][0], torch.exp( torch.neg( t ) ) ) + self.params[i][1] )
                elif pi == 'lin':
                    kernels.append( self.params[i][0] * t  + self.params[i][1] )
                elif pi == 'ind':
                    kernels.append( t )
                elif pi == 'const':
                    kernels.append( torch.ones(t.size(), device=self.device ) )
                elif pi == 'thres':
                    kernels.append( torch.reciprocal( 1 + torch.exp( -self.params[i][0] * t + self.params[i][1] ) )  )
                    
            return torch.stack( kernels, dim=2)
                
        self.decay = decay_constructor   
            
        self.contextualize_opt = contextualize_opt
        if self.contextualize_opt == 'item_subspace':
            subspace_size = 10
            bidirectional = True
            self.gru = nn.GRU(hidden_size, subspace_size, num_layers=1, dropout=dropout_hidden, batch_first=True, bidirectional=bidirectional)
            self.gru2context = nn.Linear( 20 , self.kernel_num)

        
        self.hidden_size = hidden_size
        self.batch_size = batch_size
    
        self = self.to(self.device)
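The kernel_type string above is parsed as a '-'-separated list: each recognised name adds one temporal kernel, a '^' suffix changes the parameter initialisation, and a digit repeats the preceding name. A hedged instantiation sketch follows; the class name and values are placeholders.

# 'exp-2-lin' parses to three kernels: ['exp', 'exp', 'lin']
model = KernelAttentionModel(input_size=10000, hidden_size=128, output_size=10000,
                             num_layers=2, num_heads=4, batch_size=64,
                             window_size=8, kernel_type='exp-2-lin')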
Example #12
    def __init__(self,
                 input_vocab,
                 target_vocab,
                 d_model=512,
                 d_int=2048,
                 d_k=64,
                 h=8,
                 n_layers=6,
                 dropout_rate=0.1,
                 max_len_pe=200,
                 bert_name=None):
        """
        :param input_vocab: Vocab based on BERT tokenizer
        :param target_vocab: Vocab based on BERT tokenizer; requires embedding.
        Fields: tokenizer, tokenizer.ids_to_tokens = ordered_dict
        (pad=0, start=1, end=2)
        :param size: Size of the BERT model: base or large
        :param d_model: dimension of transformer embeddings #TODO add linear layer to map BERT output to dim 512?
        :param dropout_rate: dropout, default 0.1
        """
        super(TSP, self).__init__()
        self.dropout_rate = dropout_rate
        self.input_vocab = input_vocab
        self.target_vocab = target_vocab
        self.model_embeddings_source = nn.Sequential(
            DecoderEmbeddings(vocab=self.input_vocab, embed_size=d_model),
            PositionalEncoding(d_model=d_model,
                               dropout=dropout_rate,
                               max_len=max_len_pe))
        self.model_embeddings_target = nn.Sequential(
            DecoderEmbeddings(vocab=self.target_vocab, embed_size=d_model),
            PositionalEncoding(d_model=d_model,
                               dropout=dropout_rate,
                               max_len=max_len_pe))
        self.encoder = TransformerEncoder(layer=EncoderLayer(
            d_model=d_model,
            d_int=d_int,
            d_k=d_k,
            d_v=d_k,
            h=h,
            p_drop=dropout_rate),
                                          n_layer=n_layers)
        self.decoder = Transformer(layer=DecoderLayer(d_model=d_model,
                                                      d_int=d_int,
                                                      d_k=d_k,
                                                      d_v=d_k,
                                                      h=h,
                                                      p_drop=dropout_rate),
                                   n_layer=n_layers)
        self.linear_projection = nn.Linear(
            d_model,
            len(self.target_vocab.tokenizer.ids_to_tokens),
            bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

        self.device = self.linear_projection.weight.device

        initialize_weights(self.encoder)
        initialize_weights(self.decoder)
        initialize_weights(self.linear_projection)
        initialize_weights(self.model_embeddings_source)
        initialize_weights(self.model_embeddings_target)
Example #13
def load_google_bert(path,
                     keep_all_bert_tokens=True,
                     special_tokens=None,
                     add_special_token_to_begin=True,
                     num_segments=2,
                     use_attn_mask=True,
                     max_len=512,
                     verbose=False,
                     use_pooler=False,
                     use_masked_lm=False,
                     use_next_sp=False,
                     is_training=True,
                     **kwargs):
    r"""Load the pretrained weights of the Google BERT model. Nothe that
    their vocab is as: # ``pad, 99 unused, unk, cls, sep, mask, [vocab]``
    in case you may want to specify your own special token mapping

    Inputs:
        ``path`` (str): the path containing the pretrained model
        ``keep_all_bert_tokens``: whether or not to keep the original
    vocab embeddings of BERT with all its unused/special tokens
        ``special_tokens`` (List[(Token, Index_in_Bert_Vocab)]):
    the special token mapping for your problem. Only taken into account
    if ``keep_all_bert_tokens`` is False.
    E.g., ('PAD', 0), ('MSK', 103), ('BOS', 101), ('DEL', 102), ('EOS', 102)
    Default: None
        ``add_special_token_to_begin``: if True, add the special tokens at
    the beginning so that the vocab is [special_tokens, [vocab]]; otherwise it is
    [[vocab], special_tokens]. Only taken into account if
    ``keep_all_bert_tokens`` is False.
        ``num_segments`` (int): number of segments. If set to zero,
    the segment embeddings won't be used. Default: 2.
        ``use_attn_mask`` (bool): whether or not the layer expects an
    attention mask in the computation. Default: ``True``.
        ``max_len`` (int): maximum length of the input sequence. Default: 512.
        ``use_pooler`` (bool): whether or not to compute the pooled
    representation of the input sequences. Default: ``False``.
        ``use_masked_lm`` (bool): whether or not to compute the masked
    language modeling outputs. Default: ``False``.
        ``use_next_sp`` (bool): whether or not to compute the outputs
    of the next sentence prediction task. Default: ``False``.
        ``is_training`` (bool): whether or not the model is instantiated for
    training purposes.

    Outputs:
        ``model``: the ``TransformerEncoder`` model instantiated with the
    pretrained weights

    """

    if not use_pooler:
        use_next_sp = False

    BERT_SPECIAL_COUNT = 4
    BERT_UNUSED_COUNT = 99

    if special_tokens is None:
        special_tokens = []
    special_count = len(special_tokens)

    bert_config = BertConfig.from_json_file(path + 'bert_config.json')
    init_checkpoint = path + 'bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    if keep_all_bert_tokens:
        vocab_size = bert_config.vocab_size - special_count
    else:
        vocab_size = bert_config.vocab_size - BERT_SPECIAL_COUNT - BERT_UNUSED_COUNT

    if 'neg_inf' not in kwargs:
        kwargs['neg_inf'] = float(-1e4)
    if 'use_one_embedding_dropout' not in kwargs:
        kwargs['use_one_embedding_dropout'] = True
    if 'layer_norm_epsilon' not in kwargs:
        kwargs['layer_norm_epsilon'] = 1e-12
    if 'embedding_dropout' not in kwargs:
        kwargs['embedding_dropout'] = 0.1
    if 'attention_dropout' not in kwargs:
        kwargs['attention_dropout'] = bert_config.attention_probs_dropout_prob
    if 'residual_dropout' not in kwargs:
        kwargs['residual_dropout'] = bert_config.hidden_dropout_prob
    if 'task_dropout' not in kwargs:
        kwargs['task_dropout'] = 0.1
    if 'use_gelu' not in kwargs:
        kwargs['use_gelu'] = True
    if 'accurate_gelu' not in kwargs:
        kwargs['accurate_gelu'] = True
    if 'use_pad_mask' not in kwargs:
        kwargs['use_pad_mask'] = False
    kwargs['vocab_size'] = vocab_size + special_count
    kwargs['n_layers'] = bert_config.num_hidden_layers
    kwargs['d_model'] = bert_config.hidden_size
    kwargs['d_inner'] = bert_config.intermediate_size
    kwargs['n_head'] = bert_config.num_attention_heads
    kwargs['d_k'] = bert_config.hidden_size // bert_config.num_attention_heads
    kwargs['d_v'] = bert_config.hidden_size // bert_config.num_attention_heads
    kwargs['d_out'] = bert_config.hidden_size
    kwargs['num_segments'] = num_segments
    kwargs['max_len'] = min(512, max_len)
    kwargs['embedding_layer_norm'] = True
    kwargs['trainable_pos_embedding'] = True

    if not is_training:
        kwargs['embedding_dropout'] = 0.0
        kwargs['attention_dropout'] = 0.0
        kwargs['residual_dropout'] = 0.0
        kwargs['task_dropout'] = 0.0

    model = TransformerEncoder(use_attn_mask=use_attn_mask,
                               use_pooler=use_pooler,
                               use_masked_lm=use_masked_lm,
                               use_next_sp=use_next_sp,
                               **kwargs)

    maxi_len = min(512, max_len)
    input_shape = [(None, maxi_len)]
    if num_segments > 0:
        input_shape.append((None, maxi_len))
    input_shape.append((None, maxi_len))
    if use_attn_mask:
        input_shape.append((None, 1, maxi_len, maxi_len))
    if 'use_pad_mask' in kwargs and kwargs['use_pad_mask']:
        input_shape.append((None, maxi_len, 1))

    model.build(input_shape)

    # weights = [np.zeros(w.shape) for w in model.weights]
    weights = [w for i, w in enumerate(model.get_weights())]
    if verbose:
        print('weight num: ', len(weights))

    for var_name, _ in var_names:
        w_id = None
        qkv = None
        unsqueeze = False
        transpose = False
        lm_flag = False
        parts = var_name.split('/')
        beg_off = 0 if num_segments > 0 else -1  # no segments
        first_vars_size = 5 + beg_off
        if parts[1] == 'embeddings':
            n = parts[-1]
            if n == 'token_type_embeddings':
                if num_segments <= 0:
                    continue
                w_id = 0 + beg_off
            elif n == 'position_embeddings':
                w_id = 1 + beg_off
            elif n == 'word_embeddings':
                w_id = 2 + beg_off
            elif n == 'gamma':
                w_id = 3 + beg_off
            elif n == 'beta':
                w_id = 4 + beg_off
            else:
                raise ValueError()
        elif parts[2].startswith('layer_'):
            layer_number = int(parts[2][len('layer_'):])
            if parts[3] == 'attention':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 5
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 4
                elif parts[-2] == 'dense':
                    if parts[-1] == 'bias':
                        w_id = first_vars_size + layer_number * 12 + 3
                    elif parts[-1] == 'kernel':
                        w_id = first_vars_size + layer_number * 12 + 2
                        unsqueeze = True
                    else:
                        raise ValueError()
                elif ((parts[-2] == 'key') or (parts[-2] == 'query')
                      or (parts[-2] == 'value')):
                    tmp = (0 if parts[-1] == 'kernel' else 1)
                    w_id = first_vars_size + layer_number * 12 + tmp
                    unsqueeze = parts[-1] == 'kernel'
                    qkv = parts[-2][0]
                else:
                    raise ValueError()
            elif parts[3] == 'intermediate':
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 7
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 6
                    unsqueeze = True
                else:
                    raise ValueError()
            elif parts[3] == 'output':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 11
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 10
                elif parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 9
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 8
                    unsqueeze = True
                else:
                    raise ValueError()
        elif parts[1] == 'pooler':
            if use_pooler:
                layer_number = bert_config.num_hidden_layers
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 1
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12
                    unsqueeze = True
                else:
                    raise ValueError()
        elif parts[1] == 'predictions':
            layer_number = bert_config.num_hidden_layers
            base_offset = first_vars_size + layer_number * 12 + (
                2 if use_pooler else 0)
            if use_masked_lm:
                if parts[-1] == 'output_bias':
                    w_id = base_offset
                    lm_flag = True
                elif parts[-1] == 'gamma':
                    w_id = base_offset + 1
                elif parts[-1] == 'beta':
                    w_id = base_offset + 2
                elif parts[-1] == 'bias':
                    w_id = base_offset + 4
                elif parts[-1] == 'kernel':
                    w_id = base_offset + 3
                    unsqueeze = True
                else:
                    raise ValueError()
        elif parts[1] == 'seq_relationship':
            layer_number = bert_config.num_hidden_layers
            base_offset = first_vars_size + layer_number * 12 + (
                2 if use_pooler else 0)
            if use_masked_lm:
                base_offset += 6
            if use_next_sp:
                if parts[-1] == 'output_bias':
                    w_id = base_offset + 1
                elif parts[-1] == 'output_weights':
                    w_id = base_offset
                    unsqueeze = False
                    transpose = True
                else:
                    raise ValueError()

        if w_id is not None and qkv is None:
            if verbose:
                print('w_id: ', w_id)
                print(var_name, ' -> ', model.weights[w_id].name)

            tr_id = w_id - beg_off

            if tr_id == 0:  # segment embedding
                if num_segments > 0:
                    num_seg = min(num_segments, 2)
                    weights[w_id][:num_seg, :] = check_point.get_tensor(
                        var_name
                    )[:num_seg, :] if not unsqueeze else check_point.get_tensor(
                        var_name)[None, :num_seg, :]

            elif tr_id == 1:  # pos embedding
                weights[w_id][:max_len, :] = check_point.get_tensor(
                    var_name
                )[:max_len, :] if not unsqueeze else check_point.get_tensor(
                    var_name)[None, :max_len, :]

            elif tr_id == 2:  # word embedding
                # ours: unk, [vocab], pad, msk(mask), bos(cls),
                #       del(use sep again), eos(sep)
                # theirs: pad, 99 unused, unk, cls, sep, mask, [vocab]

                # vocab_size, emb_size
                saved = check_point.get_tensor(var_name)

                if keep_all_bert_tokens:
                    weights[w_id][:] = saved
                else:
                    weights_vocab = saved[-vocab_size:]
                    if special_count > 0:
                        for i in range(len(special_tokens)):
                            idx = i
                            if not add_special_token_to_begin:
                                idx += vocab_size
                            assert special_tokens[i][
                                1] <= 103 and special_tokens[i][1] >= 0
                            weights[w_id][idx] = saved[special_tokens[i][1]]
                    if not add_special_token_to_begin:
                        idx = 0
                    else:
                        idx = special_count
                    weights[w_id][idx:vocab_size + idx] = weights_vocab

            elif lm_flag:
                # ours: unk, [vocab], pad, msk(mask), bos(cls),
                #       del(use sep again), eos(sep)
                # theirs: pad, 99 unused, unk, cls, sep, mask, [vocab]

                saved = check_point.get_tensor(var_name)

                if keep_all_bert_tokens:
                    weights[w_id][:] = saved
                else:
                    weights_vocab = saved[-vocab_size:]
                    if special_count > 0:
                        for i in range(len(special_tokens)):
                            idx = i
                            if not add_special_token_to_begin:
                                idx += vocab_size
                            assert special_tokens[i][
                                1] <= 103 and special_tokens[i][1] >= 0
                            weights[w_id][idx] = saved[special_tokens[i][1]]
                    if not add_special_token_to_begin:
                        idx = 0
                    else:
                        idx = special_count
                    weights[w_id][idx:vocab_size + idx] = weights_vocab

            else:
                if not transpose:
                    weights[w_id][:] = check_point.get_tensor(
                        var_name) if not unsqueeze else check_point.get_tensor(
                            var_name)[None, ...]
                else:
                    w_temp = check_point.get_tensor(
                        var_name) if not unsqueeze else check_point.get_tensor(
                            var_name)[None, ...]
                    weights[w_id][:] = np.transpose(w_temp)

        elif w_id is not None:
            if verbose:
                print('w_id: ', w_id)
                print(var_name, ' -> ', model.weights[w_id].name, '::', qkv)

            p = {'q': 0, 'k': 1, 'v': 2}[qkv]
            if weights[w_id].ndim == 3:
                dim_size = weights[w_id].shape[1]
                weights[w_id][
                    0, :,
                    p * dim_size:(p + 1) * dim_size] = check_point.get_tensor(
                        var_name) if not unsqueeze else check_point.get_tensor(
                            var_name)[None, ...]
            else:
                dim_size = weights[w_id].shape[0] // 3
                weights[w_id][p * dim_size:(p + 1) *
                              dim_size] = check_point.get_tensor(var_name)
        else:
            if verbose:
                # TODO cls/predictions, cls/seq_relationship
                print('not mapped: ', var_name)
    model.set_weights(weights)
    return model
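A hedged usage sketch of the loader above, assuming path points at a directory (with a trailing slash) containing bert_config.json and bert_model.ckpt; the directory name is a placeholder.

model = load_google_bert(path='uncased_L-12_H-768_A-12/',  # placeholder checkpoint directory
                         max_len=128,
                         use_pooler=False,
                         is_training=False)  # zeroes out every dropout for inference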
Example #14
def load_openai_transformer(path,
                            special_tokens=None,
                            add_special_token_to_begin=True,
                            num_segments=2,
                            use_attn_mask=True,
                            max_len=512,
                            use_one_embedding_dropout=False,
                            is_training=True,
                            **kwargs):
    r"""Load the pretrained weights of the OpenAI model.

    Inputs:
        ``path`` (str): the path containing the pretrained model
        ``special_tokens`` (int or List[(Token, Index_in_Bert_Vocab)]):
    the special token mapping for your problem.
    E.g., ('PAD', 0), ('MSK', 103), ('BOS', 101), ('DEL', 102), ('EOS', 102)
    Default: None
        ``add_special_token_to_begin``: if True, add the special tokens at
    the beginning so that the vocab is [special_tokens, [vocab]]; otherwise it is
    [[vocab], special_tokens].
        ``num_segments`` (int): number of segments. If set to zero,
    the segment embeddings won't be used. Default: 2.
        ``use_attn_mask`` (bool): whether or not the layer expects an
    attention mask in the computation. Default: ``True``.
        ``max_len`` (int): maximum length of the input sequence. Default: 512.
        ``use_one_embedding_dropout`` (bool): if ``True``, the different
    embeddings will be summed up before applying dropout; otherwise dropout
    is applied to each embedding type independently before summing them.
    Default: ``False``.
        ``is_training`` (bool): whether or not the model is instantiated for
    training purposes.

    Outputs:
        ``model``: the ``TransformerEncoder`` model instantiated with the
    pretrained weights

    """

    if special_tokens is None:
        special_tokens = []
    if isinstance(special_tokens, (list, tuple)):
        special_count = len(special_tokens)
    else:
        special_count = int(special_tokens)

    with open(path + 'params_shapes.json') as f:
        shapes = json.load(f)

    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [
        np.load(path + 'params_{}.npy'.format(n)) for n in range(10)
    ]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [
        param.reshape(shape) for param, shape in zip(init_params, shapes)
    ]
    init_params[0] = init_params[0][:min(512, max_len)]

    # add special token embedding to token embedding
    # the special tokens are added at the end of the [vocab]
    if special_count > 0:
        if not add_special_token_to_begin:
            init_params[1] = np.concatenate(
                (init_params[1],
                 np.random.randn(special_count, 768).astype(np.float32) *
                 0.02),
                axis=0)
        else:
            init_params[1] = np.concatenate(
                (np.random.randn(special_count, 768).astype(np.float32) * 0.02,
                 init_params[1]),
                axis=0)

    if num_segments > 0:
        # adding parameters for segment embeddings if needed
        init_params = [np.zeros((num_segments, 768)).astype(np.float32)
                       ] + init_params  # segment embedding

    kwargs['vocab_size'] = 40478 + special_count
    kwargs['n_layers'] = 12
    kwargs['d_model'] = 768
    kwargs['d_inner'] = 768 * 4
    kwargs['n_head'] = 12
    kwargs['d_k'] = 768 // 12
    kwargs['d_v'] = 768 // 12
    kwargs['d_out'] = 768
    kwargs['num_segments'] = num_segments
    kwargs['max_len'] = min(512, max_len)
    kwargs['embedding_layer_norm'] = False
    kwargs['trainable_pos_embedding'] = True

    if 'neg_inf' not in kwargs:
        kwargs['neg_inf'] = -1e9
    if 'layer_norm_epsilon' not in kwargs:
        kwargs['layer_norm_epsilon'] = 1e-5
    if 'embedding_dropout' not in kwargs:
        kwargs['embedding_dropout'] = 0.1
    if 'attention_dropout' not in kwargs:
        kwargs['attention_dropout'] = 0.1
    if 'residual_dropout' not in kwargs:
        kwargs['residual_dropout'] = 0.1
    if 'task_dropout' not in kwargs:
        kwargs['task_dropout'] = 0.1
    if 'use_gelu' not in kwargs:
        kwargs['use_gelu'] = True
    if 'accurate_gelu' not in kwargs:
        kwargs['accurate_gelu'] = False
    if 'use_pad_mask' not in kwargs:
        kwargs['use_pad_mask'] = False

    if not is_training:
        kwargs['embedding_dropout'] = 0.0
        kwargs['attention_dropout'] = 0.0
        kwargs['residual_dropout'] = 0.0
        kwargs['task_dropout'] = 0.0

    model = TransformerEncoder(
        use_one_embedding_dropout=use_one_embedding_dropout,
        use_attn_mask=use_attn_mask,
        **kwargs)
    maxi_len = min(512, max_len)
    input_shape = [(None, maxi_len)]
    if num_segments > 0:
        input_shape.append((None, maxi_len))
    input_shape.append((None, maxi_len))

    if use_attn_mask:
        input_shape.append((None, 1, maxi_len, maxi_len))
    if 'use_pad_mask' in kwargs and kwargs['use_pad_mask']:
        input_shape.append((None, maxi_len, 1))

    model.build(input_shape)

    n_params = len(init_params)

    weights = [
        None if i < n_params else w for i, w in enumerate(model.get_weights())
    ]

    weights[:n_params] = init_params[:]

    # model.set_weights(init_params)
    model.set_weights(weights)

    return model
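A hedged usage sketch of the loader above; the weights directory (with a trailing slash) must contain params_shapes.json and params_0.npy ... params_9.npy as read by the loader, and the directory name here is a placeholder.

model = load_openai_transformer(path='openai_transformer_weights/',  # placeholder directory
                                special_tokens=[('PAD', 0), ('BOS', 1), ('EOS', 2)],
                                num_segments=0,     # skip segment embeddings
                                max_len=128,
                                is_training=False)  # dropout disabled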