Example #1
    def val_dataloader(self):
        from data import IndicDataset, PadSequence
        # Collate function that pads source/target sequences with their tokenizers' pad ids.
        pad_sequence = PadSequence(self.tokenizers.src.pad_token_id, self.tokenizers.tgt.pad_token_id)

        # Validation split: fixed order, padded batches of eval_size examples.
        return DataLoader(IndicDataset(self.tokenizers.src, self.tokenizers.tgt, self.config.data, False),
                          batch_size=self.config.eval_size,
                          shuffle=False,
                          collate_fn=pad_sequence)
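
PadSequence itself is never shown in these snippets; it comes from the project's data module. A minimal sketch of a collate_fn with the same constructor signature, assuming each dataset item is a (src_ids, tgt_ids) pair of 1-D tensors (the real class in data.py may differ):

import torch

class PadSequence:
    """Hypothetical collate_fn: pads (src_ids, tgt_ids) pairs to the longest item in the batch."""
    def __init__(self, src_pad_token_id, tgt_pad_token_id):
        self.src_pad_token_id = src_pad_token_id
        self.tgt_pad_token_id = tgt_pad_token_id

    def __call__(self, batch):
        # batch: list of (src_ids, tgt_ids) tensors of varying lengths
        src, tgt = zip(*batch)
        src = torch.nn.utils.rnn.pad_sequence(src, batch_first=True,
                                              padding_value=self.src_pad_token_id)
        tgt = torch.nn.utils.rnn.pad_sequence(tgt, batch_first=True,
                                              padding_value=self.tgt_pad_token_id)
        return src, tgt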
Example #2
    def __init__(self, config):
        super(LightModule, self).__init__()
        self.hparam = config
        init_seed()       # fix random seeds for reproducibility
        preproc_data()    # preprocess the dataset before loaders are built

        # Build the seq2seq model plus its source/target tokenizers.
        self.model, self.tokenizers = M.build_model(config)
        self.pad_sequence = PadSequence(self.tokenizers.src.pad_token_id,
                                        self.tokenizers.tgt.pad_token_id)
        print('init success')
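
A hypothetical driver for a module like this, assuming LightModule also implements the usual *_dataloader and training_step hooks (Trainer arguments vary across Lightning versions):

import pytorch_lightning as pl

module = LightModule(config)        # config object as used above
trainer = pl.Trainer(max_epochs=10)
trainer.fit(module)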
Example #3
    def test_dataloader(self):
        from data import IndicDataset, PadSequence
        pad_sequence = PadSequence(self.src_tokenizers.pad_token_id,
                                   self.tgt_tokenizers.pad_token_id)

        # Test split: one example per batch, fixed order, padded via pad_sequence.
        return DataLoader(IndicDataset(self.src_tokenizers,
                                       self.tgt_tokenizers, self.config.data,
                                       False, True),
                          batch_size=1,
                          shuffle=False,
                          collate_fn=pad_sequence)
Example #4
    def __init__(self, config):
        super().__init__()
        src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        tgt_tokenizer.bos_token = '<s>'
        tgt_tokenizer.eos_token = '</s>'
        #hidden_size and intermediate_size are totals across all attention heads;
        #hidden_size must be divisible by num_attention_heads.
        encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12)

        decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12)

        #Create encoder and decoder embedding layers.
        encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size, config.hidden_size, padding_idx=src_tokenizer.pad_token_id)
        decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=tgt_tokenizer.pad_token_id)

        encoder = BertModel(encoder_config)
        encoder.set_input_embeddings(encoder_embeddings.cuda())
        
        #decoder_config.add_cross_attention=True
        #decoder_config.is_decoder=True
        decoder = BertForMaskedLM(decoder_config)
        decoder.set_input_embeddings(decoder_embeddings.cuda())
        #Bundle the tokenizers and store the components on the module.
        tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
        self.encoder = encoder
        self.decoder = decoder
        self.pad_sequence = PadSequence(tokenizers.src.pad_token_id, tokenizers.tgt.pad_token_id)
        self.tokenizers = tokenizers
        self.config = config
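
The forward pass is not part of this snippet. One purely illustrative way the two halves could be wired, assuming the commented-out is_decoder/add_cross_attention flags are turned on so the masked-LM decoder can attend to the encoder output:

    def forward(self, src_ids, tgt_ids):
        # Sketch only: requires decoder_config.is_decoder = True and
        # decoder_config.add_cross_attention = True in the constructor above.
        src_mask = (src_ids != self.tokenizers.src.pad_token_id).long()
        encoder_hidden = self.encoder(input_ids=src_ids, attention_mask=src_mask)[0]
        decoder_out = self.decoder(input_ids=tgt_ids,
                                   encoder_hidden_states=encoder_hidden,
                                   encoder_attention_mask=src_mask,
                                   labels=tgt_ids)
        return decoder_out[0]  # masked-LM loss over the target tokens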
Example #5
def gen_model_loaders(config):
    model, tokenizers = M.build_model(config)
    pad_sequence = PadSequence(
        tokenizers.src.pad_token_id, tokenizers.tgt.pad_token_id)
    train_loader = DataLoader(IndicDataset(tokenizers.src, tokenizers.tgt, config.data, True),
                              batch_size=config.batch_size,
                              shuffle=False,
                              collate_fn=pad_sequence)
    eval_loader = DataLoader(IndicDataset(tokenizers.src, tokenizers.tgt, config.data, False),
                             batch_size=config.eval_size,
                             shuffle=False,
                             collate_fn=pad_sequence)
    return model, tokenizers, train_loader, eval_loader
def build_model(config):

    src_tokenizer = BertTokenizer.from_pretrained(
        'bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    #hidden_size and intermediate_size are totals across all attention heads;
    #hidden_size must be divisible by num_attention_heads.
    encoder_config = BertConfig(
        vocab_size=src_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12)

    decoder_config = BertConfig(
        vocab_size=tgt_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_decoder=False)

    #Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(
        src_tokenizer.vocab_size,
        config.hidden_size,
        padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(
        tgt_tokenizer.vocab_size,
        config.hidden_size,
        padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings)

    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings)

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    pad_sequence = PadSequence(tokenizers.src.pad_token_id,
                               tokenizers.tgt.pad_token_id)

    # model = TranslationModel(encoder, decoder)
    model = MyLightningModule(encoder, decoder, config, tokenizers,
                              pad_sequence)
    # model.cuda()

    return model, tokenizers
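
A hypothetical driver for these helpers; the SimpleNamespace fields below simply mirror the config attributes referenced above and stand in for the project's real config object:

from types import SimpleNamespace

config = SimpleNamespace(hidden_size=256, num_hidden_layers=4,
                         num_attention_heads=4, intermediate_size=512,
                         hidden_act='gelu', dropout_prob=0.1,
                         data='data/', batch_size=32, eval_size=32)

model, tokenizers, train_loader, eval_loader = gen_model_loaders(config)
for batch in train_loader:
    ...  # each batch is already padded by pad_sequence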
Example #7
    def prepare_data(self):
        self.pad_sequence = PadSequence(self.tokenizers.src.pad_token_id,
                                        self.tokenizers.tgt.pad_token_id)
Example #8
def gen_model_loaders(config):
    encoder, decoder, tokenizers = build_enc_dec_tokenizers(config)
    pad_sequence = PadSequence(tokenizers.src.pad_token_id,
                               tokenizers.tgt.pad_token_id)

    return encoder, decoder, tokenizers, pad_sequence
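
The returned pieces plug into loaders just as in Example #5; a brief hypothetical follow-up (IndicDataset and DataLoader as imported there):

encoder, decoder, tokenizers, pad_sequence = gen_model_loaders(config)
train_loader = DataLoader(IndicDataset(tokenizers.src, tokenizers.tgt, config.data, True),
                          batch_size=config.batch_size,
                          shuffle=True,   # assumption: shuffle the training split
                          collate_fn=pad_sequence)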