Beispiel #1
0
def build_model(config):

    src_tokenizer = BertTokenizer.from_pretrained(
        'bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    #hidden_size and intermediate_size are both wrt all the attention heads.
    #Should be divisible by num_attention_heads
    encoder_config = BertConfig(
        vocab_size=src_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12)

    decoder_config = BertConfig(
        vocab_size=tgt_tokenizer.vocab_size,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
        hidden_dropout_prob=config.dropout_prob,
        attention_probs_dropout_prob=config.dropout_prob,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_decoder=True)

    #Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(
        src_tokenizer.vocab_size,
        config.hidden_size,
        padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(
        tgt_tokenizer.vocab_size,
        config.hidden_size,
        padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings.cuda())

    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings.cuda())

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})

    return encoder, decoder, tokenizers
Beispiel #2
0
    def __init__(self,config):

        super().__init__() 
        src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        tgt_tokenizer.bos_token = '<s>'
        tgt_tokenizer.eos_token = '</s>'
        #hidden_size and intermediate_size are both wrt all the attention heads. 
        #Should be divisible by num_attention_heads
        encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12)

        decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12,)

        #Create encoder and decoder embedding layers.
        encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size, config.hidden_size, padding_idx=src_tokenizer.pad_token_id)
        decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=tgt_tokenizer.pad_token_id)

        encoder = BertModel(encoder_config)
        encoder.set_input_embeddings(encoder_embeddings.cuda())
        
        #decoder_config.add_cross_attention=True
        #decoder_config.is_decoder=True
        decoder = BertForMaskedLM(decoder_config)
        decoder.set_input_embeddings(decoder_embeddings.cuda())
        #Creating encoder and decoder with their respective embeddings.
        tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
        self.encoder = encoder
        self.decoder = decoder
        self.pad_sequence=PadSequence(tokenizers.src.pad_token_id, tokenizers.tgt.pad_token_id)
        self.tokenizers=tokenizers
        self.config=config
Beispiel #3
0
def main():
    parser = argparse.ArgumentParser(
        description='Train the individual Transformer model')
    parser.add_argument('--dataset_folder', type=str, default='datasets')
    parser.add_argument('--dataset_name', type=str, default='zara1')
    parser.add_argument('--obs', type=int, default=8)
    parser.add_argument('--preds', type=int, default=12)
    parser.add_argument('--emb_size', type=int, default=1024)
    parser.add_argument('--heads', type=int, default=8)
    parser.add_argument('--layers', type=int, default=6)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--output_folder', type=str, default='Output')
    parser.add_argument('--val_size', type=int, default=50)
    parser.add_argument('--gpu_device', type=str, default="0")
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--validation_epoch_start', type=int, default=30)
    parser.add_argument('--resume_train', action='store_true')
    parser.add_argument('--delim', type=str, default='\t')
    parser.add_argument('--name', type=str, default="zara1")

    args = parser.parse_args()
    model_name = args.name

    try:
        os.mkdir('models')
    except:
        pass
    try:
        os.mkdir('output')
    except:
        pass
    try:
        os.mkdir('output/BERT')
    except:
        pass
    try:
        os.mkdir(f'models/BERT')
    except:
        pass

    try:
        os.mkdir(f'output/BERT/{args.name}')
    except:
        pass

    try:
        os.mkdir(f'models/BERT/{args.name}')
    except:
        pass

    log = SummaryWriter('logs/BERT_%s' % model_name)

    log.add_scalar('eval/mad', 0, 0)
    log.add_scalar('eval/fad', 0, 0)

    try:
        os.mkdir(args.name)
    except:
        pass

    device = torch.device("cuda")
    if args.cpu or not torch.cuda.is_available():
        device = torch.device("cpu")

    args.verbose = True

    ## creation of the dataloaders for train and validation
    train_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                    args.dataset_name,
                                                    0,
                                                    args.obs,
                                                    args.preds,
                                                    delim=args.delim,
                                                    train=True,
                                                    verbose=args.verbose)
    val_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                  args.dataset_name,
                                                  0,
                                                  args.obs,
                                                  args.preds,
                                                  delim=args.delim,
                                                  train=False,
                                                  verbose=args.verbose)
    test_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                   args.dataset_name,
                                                   0,
                                                   args.obs,
                                                   args.preds,
                                                   delim=args.delim,
                                                   train=False,
                                                   eval=True,
                                                   verbose=args.verbose)

    from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, AdamW

    config = BertConfig(vocab_size=30522,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072,
                        hidden_act='relu',
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        max_position_embeddings=512,
                        type_vocab_size=2,
                        initializer_range=0.02,
                        layer_norm_eps=1e-12)
    model = BertModel(config).to(device)

    from individual_TF import LinearEmbedding as NewEmbed, Generator as GeneratorTS
    a = NewEmbed(3, 768).to(device)
    model.set_input_embeddings(a)
    generator = GeneratorTS(768, 2).to(device)
    #model.set_output_embeddings(GeneratorTS(1024,2))

    tr_dl = torch.utils.data.DataLoader(train_dataset,
                                        batch_size=args.batch_size,
                                        shuffle=True,
                                        num_workers=0)
    val_dl = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=0)
    test_dl = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          num_workers=0)

    #optim = SGD(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01)
    #sched=torch.optim.lr_scheduler.StepLR(optim,0.0005)
    optim = NoamOpt(
        768, 0.1, len(tr_dl),
        torch.optim.Adam(list(a.parameters()) + list(model.parameters()) +
                         list(generator.parameters()),
                         lr=0,
                         betas=(0.9, 0.98),
                         eps=1e-9))
    #optim=Adagrad(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01,lr_decay=0.001)
    epoch = 0

    mean = train_dataset[:]['src'][:, :, 2:4].mean((0, 1)) * 0
    std = train_dataset[:]['src'][:, :, 2:4].std((0, 1)) * 0 + 1

    while epoch < args.max_epoch:
        epoch_loss = 0
        model.train()

        for id_b, batch in enumerate(tr_dl):

            optim.optimizer.zero_grad()
            r = 0
            rot_mat = np.array([[np.cos(r), np.sin(r)],
                                [-np.sin(r), np.cos(r)]])

            inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
            inp = torch.matmul(inp,
                               torch.from_numpy(rot_mat).float().to(device))
            trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device)
            inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
            trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                  1).to(device)
            inp_cat = torch.cat((inp, trg_masked), 1)
            cls_cat = torch.cat((inp_cls, trg_cls), 1)
            net_input = torch.cat((inp_cat, cls_cat), 2)

            position = torch.arange(0, net_input.shape[1]).repeat(
                inp.shape[0], 1).long().to(device)
            token = torch.zeros(
                (inp.shape[0], net_input.shape[1])).long().to(device)
            attention_mask = torch.ones(
                (inp.shape[0], net_input.shape[1])).long().to(device)

            out = model(input_ids=net_input,
                        position_ids=position,
                        token_type_ids=token,
                        attention_mask=attention_mask)

            pred = generator(out[0])

            loss = F.pairwise_distance(
                pred[:, :].contiguous().view(-1, 2),
                torch.matmul(
                    torch.cat(
                        (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                        1).contiguous().view(-1, 2).to(device),
                    torch.from_numpy(rot_mat).float().to(device))).mean()
            loss.backward()
            optim.step()
            print("epoch %03i/%03i  frame %04i / %04i loss: %7.4f" %
                  (epoch, args.max_epoch, id_b, len(tr_dl), loss.item()))
            epoch_loss += loss.item()
        #sched.step()
        log.add_scalar('Loss/train', epoch_loss / len(tr_dl), epoch)
        with torch.no_grad():
            model.eval()

            gt = []
            pr = []
            val_loss = 0
            for batch in val_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros(
                    (inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                      1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(
                    inp.shape[0], 1).long().to(device)
                token = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input,
                            position_ids=position,
                            token_type_ids=token,
                            attention_mask=attention_mask)

                pred = generator(out[0])

                loss = F.pairwise_distance(
                    pred[:, :].contiguous().view(-1, 2),
                    torch.cat(
                        (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                        1).contiguous().view(-1, 2).to(device)).mean()
                val_loss += loss.item()

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to(
                    'cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)
            log.add_scalar('validation/loss', val_loss / len(val_dl), epoch)
            log.add_scalar('validation/mad', mad, epoch)
            log.add_scalar('validation/fad', fad, epoch)

            model.eval()

            gt = []
            pr = []
            for batch in test_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros(
                    (inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                      1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(
                    inp.shape[0], 1).long().to(device)
                token = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input,
                            position_ids=position,
                            token_type_ids=token,
                            attention_mask=attention_mask)

                pred = generator(out[0])

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to(
                    'cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)

            torch.save(model.state_dict(),
                       "models/BERT/%s/ep_%03i.pth" % (args.name, epoch))
            torch.save(generator.state_dict(),
                       "models/BERT/%s/gen_%03i.pth" % (args.name, epoch))
            torch.save(a.state_dict(),
                       "models/BERT/%s/emb_%03i.pth" % (args.name, epoch))

            log.add_scalar('eval/mad', mad, epoch)
            log.add_scalar('eval/fad', fad, epoch)

        epoch += 1

    ab = 1
Beispiel #4
0
    def __init__(self):
        self.src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.tgt_tokenizer.bos_token = '<s>'
        self.tgt_tokenizer.eos_token = '</s>'

        #hidden_size and intermediate_size are both wrt all the attention heads. 
        #Should be divisible by num_attention_heads
        encoder_config = BertConfig(vocab_size=self.src_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12)

        decoder_config = BertConfig(vocab_size=self.tgt_tokenizer.vocab_size,
                                    hidden_size=config.hidden_size,
                                    num_hidden_layers=config.num_hidden_layers,
                                    num_attention_heads=config.num_attention_heads,
                                    intermediate_size=config.intermediate_size,
                                    hidden_act=config.hidden_act,
                                    hidden_dropout_prob=config.dropout_prob,
                                    attention_probs_dropout_prob=config.dropout_prob,
                                    max_position_embeddings=512,
                                    type_vocab_size=2,
                                    initializer_range=0.02,
                                    layer_norm_eps=1e-12,
                                    is_decoder=True)

        #Create encoder and decoder embedding layers.
        encoder_embeddings = torch.nn.Embedding(self.src_tokenizer.vocab_size, config.hidden_size, padding_idx=self.src_tokenizer.pad_token_id)
        decoder_embeddings = torch.nn.Embedding(self.tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=self.tgt_tokenizer.pad_token_id)

        encoder = BertModel(encoder_config)
        encoder.set_input_embeddings(encoder_embeddings.cpu())

        decoder = BertForMaskedLM(decoder_config)
        decoder.set_input_embeddings(decoder_embeddings.cpu())

        input_dirs = config.model_output_dirs

        suffix = "pytorch_model.bin"
        decoderPath = os.path.join(input_dirs['decoder'], suffix)
        encoderPath = os.path.join(input_dirs['encoder'], suffix)

        decoder_state_dict = torch.load(decoderPath)
        encoder_state_dict = torch.load(encoderPath)
        decoder.load_state_dict(decoder_state_dict)
        encoder.load_state_dict(encoder_state_dict)
        self.model = TranslationModel(encoder, decoder, None, None, self.tgt_tokenizer, config)
        self.model.cpu()


        #model.eval()
        self.model.encoder.eval()
        self.model.decoder.eval()
Beispiel #5
0
def build_model(config, train_loader, eval_loader):
    
    src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'
    
    #hidden_size and intermediate_size are both wrt all the attention heads. 
    #Should be divisible by num_attention_heads
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)

    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12,
                                is_decoder=True)

    #Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size, config.hidden_size, padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size, config.hidden_size, padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings.cpu())
    
    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings.cpu())
    """
    input_dirs = config.model_output_dirs
    if(os.listdir(input_dirs['decoder']) and os.listdir(input_dirs['encoder'])):
        suffix = "pytorch_model.bin"
        decoderPath = os.path.join(input_dirs['decoder'], suffix)
        encoderPath = os.path.join(input_dirs['encoder'], suffix)
        
        decoder_state_dict = torch.load(decoderPath)
        encoder_state_dict = torch.load(encoderPath)
        decoder.load_state_dict(decoder_state_dict)
        encoder.load_state_dict(encoder_state_dict)
        model = TranslationModel(encoder, decoder, train_loader, eval_loader, tgt_tokenizer, config)
        model.cpu()
        return model
    """
    #model = TranslationModel(encoder, decoder)
    model = TranslationModel(encoder, decoder, train_loader, eval_loader, tgt_tokenizer, config)
    model.cpu()


    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    #return model, tokenizers
    return model