Example #1
# Assumed imports for this snippet; move_to_gpu is a project-local helper
# used only on the CUDA path.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from itertools import chain


class TransformerModel(nn.Module):
    def __init__(self, dim_in, units, nhead, dim_out, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.emb = nn.Linear(dim_in, units)
        encoder_layers = TransformerEncoderLayer(units, nhead)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.decoder = nn.Linear(units, dim_out)
        self.optimizer = optim.Adam(
            chain(self.transformer_encoder.parameters(),
                  self.emb.parameters(),
                  self.decoder.parameters()),
            lr=0.01, betas=(0.5, 0.999))
        # Avoid shadowing nn.Module.cuda() with a boolean attribute.
        self.use_cuda = torch.cuda.is_available()
        self.activation = F.relu
        self.dropout = nn.Dropout(dropout)
        self.Tensor = torch.cuda.FloatTensor if self.use_cuda else torch.Tensor
        self.softmax = nn.LogSoftmax(dim=1)
        # self.loss_fn = EMDLoss()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, src):
        src = self.emb(src)
        # Note: TransformerEncoderLayer defaults to (seq_len, batch, features)
        # input; with batch-first data, pass batch_first=True or transpose
        # here, otherwise self-attention mixes examples across the batch.
        encoder_output = self.transformer_encoder(self.activation(src))
        output = self.decoder(encoder_output.mean(dim=1))
        # output = self.decoder(encoder_output[:,0])
        return output

    def train(self, batch):
        # Note: this shadows nn.Module.train(), so model.train() no longer
        # toggles training mode the way nn.Module callers expect.
        label = batch["label"]
        if self.use_cuda:
            target = move_to_gpu(label)  # project-local helper
        else:
            target = label
        # torch.autograd.Variable is deprecated; plain tensors suffice.
        src = batch['x'].type(self.Tensor)
        logits = self.forward(src)
        loss = self.loss_fn(logits, target)
        self.optimizer.zero_grad()
        # logits.retain_grad()
        loss.backward()
        # print("grad", logits.grad)
        self.optimizer.step()
        metrics = {
            "loss": loss.item(),
        }
        return metrics

    def predict(self, batch):
        src = batch['x'].type(self.Tensor)
        logits = self.forward(src)
        scores = self.softmax(logits)
        scores = torch.exp(scores)  # exp of log-softmax recovers probabilities
        return scores
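
A quick smoke test for this model (a sketch only; the shapes, hyperparameters, and batch-first layout below are illustrative assumptions, not from the original source). As noted in forward, the default encoder layout is (seq_len, batch, features), so the batch-first data here matches the .mean(dim=1) pooling but lets attention run over dim 0:

import torch

model = TransformerModel(dim_in=16, units=64, nhead=4, dim_out=3, nlayers=2)
batch = {
    "x": torch.randn(8, 10, 16),           # (batch, seq_len, dim_in)
    "label": torch.randint(0, 3, (8,)),    # one class index per example
}
metrics = model.train(batch)               # one optimization step
print(metrics["loss"])
scores = model.predict(batch)              # (batch, dim_out) probabilities
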
Example #2
# Assumed imports for this snippet; PositionalEncoding is a project-local
# module providing position embeddings.
import math
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import LayerNorm, TransformerEncoder, TransformerEncoderLayer
from torch.nn.init import xavier_uniform_


class Transformer(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 d_model: int,
                 nhead: int,
                 d_feedforward: int,
                 nlayers: int,
                 dropout: float = 0.5,
                 max_len: int = 5,
                 norm_first=False,
                 use_norm=False,
                 **kwargs):
        super().__init__()
        self.vocab_size = vocab_size
        self.model_type = 'TransformerDecoder'
        self.target_pad_idx = 3
        self.pos_encoder = PositionalEncoding(d_model,
                                              dropout,
                                              max_len=max_len)

        transformer_layers = TransformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward=d_feedforward,
            dropout=dropout,
            norm_first=norm_first,
            **kwargs)
        if use_norm:
            # Note: **kwargs is forwarded to both TransformerEncoderLayer and
            # LayerNorm, so it may only contain keys valid for both.
            transformer_norm = LayerNorm(d_model, eps=1e-5, **kwargs)
        else:
            transformer_norm = None
        self.transformer_block = TransformerEncoder(transformer_layers,
                                                    nlayers, transformer_norm)

        self.encoder = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, vocab_size)
        self.max_len = max_len
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.target_pad_idx)

        self._reset_parameters()
        self.init_weights()
        self.input_attn_mask = self.generate_square_subsequent_mask(
            self.max_len)

    @staticmethod
    def generate_square_subsequent_mask(sz: int) -> Tensor:
        r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model."""

        for p in self.transformer_block.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x: Tensor, x_lens=None) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, seq_len]
            x_lens: unused, kept for interface compatibility
        Returns:
            output Tensor of shape [batch_size, seq_len, vocab_size]
        """
        x = x.transpose(0, 1)  # to (seq_len, batch), the encoder's layout
        src = self.encoder(x) * math.sqrt(self.d_model)
        # src = self.pos_encoder(src)  # positional encoding is built but disabled
        # Rebuild the causal mask on the input's device and sequence length.
        self.input_attn_mask = self.generate_square_subsequent_mask(
            src.shape[0]).to(src.device)
        output = self.transformer_block(src, self.input_attn_mask)
        output = self.decoder(output)
        # pred = output[-1]  # torch.softmax(output[-1], 1)
        return output.transpose(0, 1).contiguous()

    def compute_loss(self, preds, targets):
        # Targets are expected one-hot over the vocabulary; argmax recovers
        # the class indices that CrossEntropyLoss needs.
        targets = targets.view(-1, self.vocab_size)
        preds = preds.view(-1, self.vocab_size)
        loss = self.criterion(preds, targets.argmax(dim=1))
        return loss
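
A minimal usage sketch for this decoder-only language model (all hyperparameters and shapes below are illustrative assumptions, not from the original source). compute_loss expects one-hot targets over the vocabulary, with padding at index 3:

import torch
import torch.nn.functional as F

model = Transformer(vocab_size=100, d_model=32, nhead=4,
                    d_feedforward=64, nlayers=2, max_len=5)
x = torch.randint(0, 100, (8, 5))          # (batch, seq_len) token ids
logits = model(x)                          # (batch, seq_len, vocab_size)
targets = F.one_hot(torch.randint(0, 100, (8, 5)), 100).float()
loss = model.compute_loss(logits, targets)
loss.backward()
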
Example #3
# Assumed imports for this snippet; SoundCNNEncoder, SoundCNNDecoder,
# PositionalEncodings and translate_from_hugginface_to_torch_BERT are
# project-local helpers.
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from transformers import BertModel, BertTokenizer


class MusicBert(torch.nn.Module):

    def __init__(self, dim_sound, num_tokens=30522, dim_model=768, nhead=12,
                 name="music_bert", num_encoder_layers=12, d_feed=3072,
                 dropout=0.1, n_convs=4):
        super(MusicBert, self).__init__()

        self.d_model = dim_model
        self.num_tokens = num_tokens

        self.PATH = "models/" + name + ".pth"

        # Token Embedder
        self.embedder = nn.Embedding(num_tokens, dim_model)

        self.sound_compressor = SoundCNNEncoder([dim_sound] + [dim_model] *
                                                (n_convs - 1))
        self.sound_decompressor = SoundCNNDecoder([dim_model] * n_convs)

        self.position_embeddings = PositionalEncodings(dim_model, dropout)

        encoder_layer = TransformerEncoderLayer(dim_model, nhead,
                                                dim_feedforward=d_feed,
                                                dropout=dropout,
                                                activation='gelu')

        device = self.get_device()

        # Register the special-token ids as buffers so they move with the module
        self.register_buffer("CLS_ID", torch.tensor([[101]]).to(device).long())
        self.register_buffer("SEP_ID", torch.tensor([[102]]).to(device).long())

        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers)

    def set_encoder_grad(self, grad):
        for param in self.encoder.parameters():
            param.requires_grad = grad

    def set_convs_grad(self, grad):
        for param in self.sound_compressor.parameters():
            param.requires_grad = grad

        for param in self.sound_decompressor.parameters():
            param.requires_grad = grad
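
    # These setters allow freezing parts of the model independently, e.g.
    # model.set_encoder_grad(False) to train only the sound CNNs while the
    # pretrained transformer stays fixed.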

    def init_weights(self):
        initrange = 0.1
        # self.encoder is a TransformerEncoder and has no .weight attribute;
        # the token embedding is the layer this initialization targets.
        self.embedder.weight.data.uniform_(-initrange, initrange)

    def get_device(self):
        return next(self.parameters()).device

    def load_pretrained(self):
        # Load pretrained BERT weights from Hugging Face

        device = self.get_device()
        bertModel = BertModel.from_pretrained("bert-base-uncased").to(device)

        bert_state_dict = translate_from_hugginface_to_torch_BERT(
            bertModel.state_dict())
        self.load_state_dict(bert_state_dict, strict=False)

        self.__test_integrity()

    def __test_integrity(self):

        self.eval()
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        BERTmodel = BertModel.from_pretrained('bert-base-uncased').to(
            self.get_device())

        input_ids = torch.tensor(
            tokenizer.encode("this is a test")).unsqueeze(0).to(
                self.get_device())
        BERT_outputs = BERTmodel(input_ids)[0]
        my_outputs = self(src_tokens=input_ids.permute(1, 0),
                          add_special_tokens=False).permute(1, 0, 2)

        outputs_mean = torch.mean(my_outputs - BERT_outputs)
        outputs_norm = torch.norm(my_outputs - BERT_outputs)

        print("== INTEGRITY TEST ==")
        print(f"Mean error: {outputs_mean}")
        print(f"Norm error: {outputs_norm}")
        self.train()

    def save_model(self, path=None):
        path = path if path is not None else self.PATH
        torch.save(self.state_dict(), path)

    def load_model(self, path=None):
        path = path if path is not None else self.PATH
        self.load_state_dict(torch.load(path))

    def forward(self,
                src_sound=None,
                src_tokens=None,
                add_special_tokens=True,
                skip_encoder=False):

        assert src_tokens is not None or src_sound is not None, \
            "Feed at least one sound or token sequence"

        # SHAPES: sound  Seq_len, Batch, Feat
        #         tokens Seq_len, Batch

        sound_length = 0
        tokens_length = 0

        ## "Preprocess" the two sequences to the same Vector space
        if src_sound is not None:
            batch_size = src_sound.shape[1]
            sound_length = src_sound.shape[0]

            src_sound = self.sound_compressor(src_sound)
            src_sound = self.position_embeddings(src_sound, token_type="sound")
        else:
            # if absent, create an empty placeholder for the concatenation below
            src_sound = torch.Tensor([]).to(self.get_device())

        if src_tokens is not None:
            batch_size = src_tokens.shape[1]
            tokens_length = src_tokens.shape[0]
        else:
            # if absent, create an empty placeholder for the concatenation below
            src_tokens = torch.Tensor([]).to(self.get_device())

        if add_special_tokens:
            CLS = self.CLS_ID.repeat(1, batch_size)
            SEP = self.SEP_ID.repeat(1, batch_size)
            src_tokens = torch.cat((CLS, src_tokens.long(), SEP), 0)
            tokens_length += 2  # account for the CLS and SEP just added

        if tokens_length != 0:
            src_tokens = self.embedder(src_tokens.long())
            src_tokens = self.position_embeddings(src_tokens,
                                                  token_type="text")

        sequence = torch.cat((src_tokens, src_sound), 0)

        if skip_encoder:
            output = sequence
        else:
            output = self.encoder(sequence)

        if sound_length > 0:
            # If sound was provided, split off its part of the output and
            # decompress it back to the original length
            sound_sequence = output[tokens_length:]
            token_sequence = output[:tokens_length]

            sound_sequence = self.sound_decompressor(sound_sequence,
                                                     target_len=sound_length)
            output = torch.cat((token_sequence, sound_sequence), 0)

        assert output.shape[0] == tokens_length + sound_length, (
            "Expected input and output sequences to have the same length, "
            "but got {:d} and {:d}".format(output.shape[0],
                                           tokens_length + sound_length))

        return output
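
A usage sketch (assuming the project-local SoundCNNEncoder, SoundCNNDecoder, PositionalEncodings and weight-translation helpers are importable; all shapes and values below are illustrative). Inputs follow the layout documented in forward: sound as (seq_len, batch, features), tokens as (seq_len, batch):

import torch

model = MusicBert(dim_sound=128)
model.load_pretrained()                       # copy BERT weights, then self-check

tokens = torch.randint(999, 29000, (16, 2))   # (seq_len, batch) token ids
sound = torch.randn(64, 2, 128)               # (seq_len, batch, dim_sound)
output = model(src_sound=sound, src_tokens=tokens)
# output.shape[0] == 2 + 16 + 64 (CLS/SEP + tokens + decompressed sound)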