def init():
    global model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'checkpoint.pth')
    checkpoint = torch.load(model_path, map_location='cpu')

    args = checkpoint['args']
    hidden_dim = args.hidden_dim
    nheads = args.nheads
    enc_layers = args.enc_layers
    dec_layers = args.dec_layers
    dim_feedforward = args.dim_feedforward
    dropout = args.dropout
    # Build the models
    backbone_model = BackboneModel(hidden_dim=hidden_dim, arch=args.backbone)
    transformer_model = TransformerModel(d_model=hidden_dim,
                                         n_head=nheads,
                                         num_encoder_layers=enc_layers,
                                         num_decoder_layers=dec_layers,
                                         dim_feedforward=dim_feedforward,
                                         dropout=dropout,
                                         activation="relu",
                                         normalize_before=False)
    model = TraMapModel(backbone_model, transformer_model)
    backbone_model.to(device)
    transformer_model.to(device)
    model.to(device)

    model.load_state_dict(checkpoint['model'])

    model.eval()
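
# A minimal companion `run()` entry point for this scoring script, sketched under the
# Azure ML init()/run() convention. The JSON field names ("sample", "query") and their
# shapes are hypothetical, not from the original repo; only the model(sample, query)
# call mirrors Example #15 below. Assumes `torch` is imported at module scope.
import json

def run(raw_data):
    data = json.loads(raw_data)
    device = next(model.parameters()).device
    sample = torch.tensor(data['sample'], dtype=torch.float32, device=device)
    query = torch.tensor(data['query'], dtype=torch.float32, device=device)
    with torch.no_grad():
        outputs = model(sample, query)
    return outputs.flatten().cpu().tolist()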
Example #3
def train(**kwargs):
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")

    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    tokenizer = SpaceTokenizer(base_path+"NMTtokenizers/spacetoken_vocab_files/vocab_nepali.json", 
                base_path+"NMTtokenizers/spacetoken_vocab_files/vocab_english.json"
                ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
                    base_path+"NMTtokenizers/wordpiece_vocab_files/vocab_newa.json", 
                    base_path+"NMTtokenizers/wordpiece_vocab_files/vocab_eng.json"
                )
    if kwargs['model'] == 'transformer':
        model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab), embed_size, 
                n_heads, dropout=dropout_rate)
    else:
        model = Seq2Seq(embed_size, hidden_size, tokenizer, dropout_rate=dropout_rate, n_layers=n_layers)
    # criterion = nn.CrossEntropyLoss()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # Exclude bias and normalization parameters from weight decay; the key must be
    # 'weight_decay' for torch.optim.AdamW ('weight_decay_rate' is silently ignored).
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=0.001)
    model.to(device)
    model = trainer(model, optimizer, train_dl, valid_dl, BATCH_SIZE, epoch,
                    device, LOG_EVERY, kwargs["checkpoint_path"], kwargs["best_model"],
                    beam_size, max_decoding_time_step)
Example #4
def decode(**kwargs):
    src_sent = open_file(kwargs['src_file'])
    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab),
                             len(tokenizer.tgt_vocab),
                             tokenizer,
                             embed_size,
                             n_heads,
                             dropout=dropout_rate)

    model.to(device)
    model, _, _, _ = load_checkpt(model, kwargs['best_model'], device)
    src_tensor, _ = tokenizer.encode(src_sent, device, return_tensor=True)

    # translator = Translator(model, beam_size, max_decoding_time_step,
    #                     model.tokenizer.src_vocab['[PAD]'], model.tokenizer.tgt_vocab['[PAD]'],
    #                     model.tokenizer.tgt_vocab['[SOS]'], model.tokenizer.tgt_vocab['[EOS]']).to(device)
    output = []
    for src in src_tensor.transpose(0, 1):
        # pred_seq = translator.translate_sentence(src.view(1, -1), device)
        hyps = beam_search_transformer(model, src.view(1, -1), beam_size,
                                       max_decoding_time_step,
                                       model.tokenizer.src_vocab['[PAD]'],
                                       model.tokenizer.tgt_vocab['[EOS]'],
                                       device)
        # pred_seq = greedy_decode(model, src.view(1, -1), max_decoding_time_step, model.tokenizer.tgt_vocab['[SOS]'],
        #         model.tokenizer.src_vocab['[PAD]'], model.tokenizer.tgt_vocab['[EOS]'], device)
        top_hyp = hyps[0]
        hyp_sent = ' '.join(top_hyp.value)
        print(hyp_sent)
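
# A hypothetical invocation of the decode() function above; the paths and flag values are
# placeholders, not taken from the original repo, and the module-level globals
# (src_vocab_path, tgt_vocab_path, device, beam_size, max_decoding_time_step) are assumed
# to be configured already.
decode(src_file='data/test.src',
       tokenizer='space_tokenizer',
       best_model='checkpoints/best_model.pt')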
Example #5
def transformer_model(c: Configs):
    from models.transformer import TransformerModel
    m = TransformerModel(n_tokens=c.n_tokens,
                         d_model=c.d_model,
                         encoder=c.transformer.encoder,
                         src_embed=c.transformer.src_embed)

    return m.to(c.device)
Example #6
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        TransformerModel.add_args(parser)
        parser.add_argument('--share-encoder-embeddings',
                            action='store_true',
                            help='share encoder embeddings across languages')
        parser.add_argument('--share-decoder-embeddings',
                            action='store_true',
                            help='share decoder embeddings across languages')
        parser.add_argument('--share-encoders',
                            action='store_true',
                            help='share encoders across languages')
        parser.add_argument('--share-decoders',
                            action='store_true',
                            help='share decoders across languages')
Example #7
    def build_model(cls, args, task):
        # set any default arguments
        transformer_align(args)

        transformer_model = TransformerModel.build_model(args, task)
        return TransformerAlignModel(transformer_model.encoder,
                                     transformer_model.decoder, args)
Example #8
    def __init__(self,
                 ntoken,
                 d_model=512,
                 nhead=8,
                 nhid=512,
                 te_nlayers=6,
                 te_dropout=0.5,
                 pretrained_vec=None,
                 n_layers=2,
                 bidirectional=True,
                 output_dim=1,
                 hidden_dim=256,
                 rnn_dropout=0.3,
                 pad_token_id=None):
        super(TEGRU, self).__init__()

        self.transformer_encoder = TransformerModel(ntoken, d_model, nhead,
                                                    nhid, te_nlayers,
                                                    pretrained_vec, te_dropout)
        self.bidirectional = bidirectional
        self.n_layers = n_layers
        self.pad_token_id = pad_token_id
        self.rnn = nn.GRU(d_model,
                          hidden_size=hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=0 if n_layers < 2 else rnn_dropout)
        rnn_input_dim = d_model if hidden_dim is None else hidden_dim
        self.fc = nn.Linear(
            2 * rnn_input_dim if bidirectional else rnn_input_dim, output_dim)
        nn.init.xavier_uniform_(self.fc.weight)
        self.dropout = nn.Dropout(rnn_dropout)
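
# The head above concatenates the final forward and backward GRU hidden states before the
# linear layer. A small standalone sketch of that combination (toy shapes; the 512/256/1
# sizes mirror the defaults above, and TEGRU's own forward() is not part of the snippet):
import torch
import torch.nn as nn

rnn = nn.GRU(512, hidden_size=256, num_layers=2, bidirectional=True)
fc = nn.Linear(2 * 256, 1)
x = torch.randn(35, 4, 512)                        # [seq_len, batch, d_model], e.g. encoder output
_, hidden = rnn(x)                                 # hidden: [num_layers * 2, batch, hidden_dim]
last = torch.cat((hidden[-2], hidden[-1]), dim=1)  # last layer's forward/backward states -> [batch, 512]
logits = fc(last)                                  # [batch, output_dim]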
Example #9
def train(**kwargs):
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")

    train_dl = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn=collate_fn)
    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)

    model = TransformerModel(len(tokenizer.src_vocab),
                             len(tokenizer.tgt_vocab),
                             tokenizer,
                             embed_size,
                             n_heads,
                             dropout=dropout_rate)
    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.6,
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    torch.autograd.set_detect_anomaly(True)

    train_model(model, optimizer, criterion, scheduler, train_dl, valid_dl,
                BATCH_SIZE, epoch, device, kwargs["checkpoint_path"],
                kwargs["best_model"], beam_size, max_decoding_time_step)
Example #10
def test(**kwargs):
    test_dataset = NMTDataset(kwargs["src_test"], kwargs["tgt_test"])
    print("Dataset loaded successfully.")

    test_dl = DataLoader(test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn=collate_fn)

    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab),
                             len(tokenizer.tgt_vocab),
                             tokenizer,
                             embed_size,
                             n_heads,
                             dropout=dropout_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    model.to(device)
    model.eval()
    bleu_score = 0
    test_loss = 0
    test_start_time = time.time()
    with torch.no_grad():
        for batch in test_dl:
            src_tensor, tgt_tensor, _, _ = model.tokenizer.encode(
                batch, device, return_tensor=True)
            src_tensor = src_tensor.transpose(0, 1)
            tgt_tensor = tgt_tensor.transpose(0, 1)
            trg_input = tgt_tensor[:, :-1]
            targets = tgt_tensor[:, 1:].contiguous().view(-1)
            preds = model(src_tensor, trg_input.to(device), device)

            loss = criterion(preds, targets)
            test_loss += loss.item() / BATCH_SIZE

            output = []
            for src in src_tensor:
                hyps = beam_search_transformer(
                    model, src.view(1, -1), beam_size, max_decoding_time_step,
                    model.tokenizer.src_vocab['[PAD]'],
                    model.tokenizer.tgt_vocab['[EOS]'], device)
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                output.append(hyp_sent)

            score = compute_bleu_score(output, batch[1])
            bleu_score += score
    print(
        f'Avg. test loss: {test_loss/len(test_dl):.5f} | BLEU Score: {bleu_score/len(test_dl)} | time elapsed: {time.time() - test_start_time}'
    )
Example #11
    def __init__(self, cfg, weights_matrix=None):
        super(SentenceEncoder, self).__init__()
        vocab_size = cfg.get('vocab_size', {})
        pretrained_embeddings = cfg.get('pretrained_embeddings', False)
        if pretrained_embeddings:
            vocab = pickle.load(open('vocab/words.pkl', 'rb'))
            weights_matrix = pretrained_weights_matrix(
                vocab, cfg, vocab_size.get('sentences', None))
        self.embedding = create_emb_layer(
            cfg, vocab_size.get('sentences', None), weights_matrix)
        self.encoder_type = cfg.get('encoder_type', 'lstm')
        if self.encoder_type not in ['lstm', 'transformer']:
            raise ValueError('Encoder needs to be valid type.')
        if self.encoder_type == 'lstm':
            self.encoder = LSTMModel(cfg)
        elif self.encoder_type == 'transformer':
            self.encoder = TransformerModel(cfg)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        if not cfg.get('use_cuda', True):
            self.device = torch.device('cpu')
Example #12
def train(
    run_name: str,
    # Data
    train_filepath: str = CSNJS_TRAIN_FILEPATH,
    eval_filepath: str = CSNJS_VALID_FILEPATH,
    spm_filepath: str = SPM_UNIGRAM_FILEPATH,
    program_mode="identity",
    eval_program_mode="identity",
    label_mode="identifier",
    num_workers=1,
    limit_dataset_size=-1,
    # Model
    model_type="transformer",
    n_decoder_layers=4,
    d_model: int = 512,
    resume_path: str = "",
    resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    resume_project: bool = False,
    # Optimization
    train_decoder_only: bool = False,
    num_epochs: int = 50,
    save_every: int = 2,
    batch_size: int = 256,
    lr: float = 8e-4,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.98,
    use_lr_warmup: bool = True,
    loss_type = "nll_token",  # nll_token or nll_sequence
    # Loss
    subword_regularization_alpha: float = 0,
    # Computational
    use_cuda: bool = True,
    auto_test: bool = True,
    seed: int = 0,
):
    """Train model"""
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    run_dir = RUN_DIR / run_name
    run_dir.mkdir(exist_ok=True, parents=True)
    logger.add(str((run_dir / "train.log").resolve()))
    logger.info(f"Saving logs, model checkpoints to {run_dir}")
    config = locals()
    logger.info(f"Config: {config}")
    wandb.init(name=run_name, config=config, job_type="training", project="identifier-prediction", entity="ml4code")

    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"

    train_augmentations = [
        {"fn": "sample_lines", "line_length_pct": 0.5},
        {"fn": "insert_var_declaration", "prob": 0.5},
        {"fn": "rename_variable", "prob": 0.5},
    ]
    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")

    # Create training dataset and dataloader
    logger.info(f"Training data path {train_filepath}")
    train_dataset = get_csnjs_dataset(train_filepath, label_mode=label_mode, limit_size=limit_dataset_size)
    logger.info(f"Training dataset size: {len(train_dataset)}")
    train_loader = javascript_dataloader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        augmentations=train_augmentations,
        sp=sp,
        program_mode=program_mode,
        subword_regularization_alpha=subword_regularization_alpha,
    )

    # Create eval dataset and dataloader
    logger.info(f"Eval data path {eval_filepath}")
    eval_dataset = get_csnjs_dataset(eval_filepath, label_mode=label_mode, limit_size=limit_dataset_size)
    logger.info(f"Eval dataset size: {len(eval_dataset)}")
    eval_loader = javascript_dataloader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        augmentations=[],
        sp=sp,
        program_mode=eval_program_mode,
        subword_regularization_alpha=subword_regularization_alpha,
    )

    # Create model
    pad_id = sp.PieceToId("[PAD]")
    if model_type == "transformer":
        model = TransformerModel(n_tokens=sp.GetPieceSize(), pad_id=pad_id, n_decoder_layers=n_decoder_layers, d_model=d_model)
        logger.info(f"Created TransformerModel with {count_parameters(model)} params")
    elif model_type == "lstm":
        model = Seq2SeqLSTM(n_tokens=sp.GetPieceSize(), pad_id=pad_id, d_model=d_model)
        logger.info(f"Created Seq2SeqLSTM with {count_parameters(model)} params")

    # Load checkpoint
    if resume_path:
        logger.info(f"Resuming training from checkpoint {resume_path}, resume_encoder_name={resume_encoder_name}")
        checkpoint = torch.load(resume_path)
        pretrained_state_dict = checkpoint["model_state_dict"]
        encoder_state_dict = {}
        assert resume_encoder_name in ["encoder_k", "encoder_q", "encoder"]

        for key, value in pretrained_state_dict.items():
            if key.startswith(resume_encoder_name + ".") and "project_layer" not in key:
                remapped_key = key[len(resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
            if key.startswith(resume_encoder_name + ".") and "project_layer.0." in key and resume_project:
                remapped_key = key[len(resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint project key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
        model.encoder.load_state_dict(encoder_state_dict, strict=False)
        logger.info(f"Loaded state dict from {resume_path}")
        logger.info(f"Loaded keys: {encoder_state_dict.keys()}")

    # Set up optimizer
    model = nn.DataParallel(model)
    model = model.cuda() if use_cuda else model
    wandb.watch(model, log="all")
    params = model.module.decoder.parameters() if train_decoder_only else model.parameters()
    optimizer = torch.optim.Adam(params, lr=lr, betas=(adam_beta1, adam_beta2), eps=1e-9)
    if use_lr_warmup:
        scheduler = get_linear_schedule_with_warmup(optimizer, 5000, len(train_loader) * num_epochs)
    else:
        scheduler = LambdaLR(optimizer, lr_lambda=lambda x: 1.0)

    global_step = 0
    min_eval_loss = float("inf")
    for epoch in tqdm.trange(1, num_epochs + 1, desc="training", unit="epoch", leave=False):
        logger.info(f"Starting epoch {epoch}\n")
        if train_decoder_only:
            model.module.encoder.eval()
            model.module.decoder.train()
        else:
            model.train()
        pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}")
        for X, Y, X_lengths, Y_lengths in pbar:
            if use_cuda:
                X = X.cuda()
                Y = Y.cuda()
                X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda()
            optimizer.zero_grad()
            # NOTE: X and Y are [B, max_seq_len] tensors (batch first)
            logits = model(X, Y[:, :-1], X_lengths, Y_lengths)
            if loss_type == "nll_sequence":
                loss = F.cross_entropy(logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id, reduction='sum')
                loss = loss / X.size(0)  # Average over num sequences, not target sequence lengths
                                        # Thus, minimize bits per sequence.
            elif loss_type == "nll_token":
                loss = F.cross_entropy(logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id,)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Log loss
            global_step += 1
            wandb.log(
                {"epoch": epoch, f"label-{label_mode}/train_loss": loss.item(), "lr": scheduler.get_last_lr()[0]}, step=global_step
            )
            pbar.set_description(f"epoch {epoch} loss {loss.item():.4f}")

        # Evaluate
        logger.info(f"Evaluating model after epoch {epoch} ({global_step} steps)...")
        max_decode_len = 20 if label_mode == "identifier" else 200
        eval_loss = _evaluate(model, eval_loader, sp, use_cuda=use_cuda, max_decode_len=max_decode_len, loss_type=loss_type)
        logger.info(f"Evaluation loss after epoch {epoch} ({global_step} steps): {eval_loss:.4f}")
        wandb.log({"epoch": epoch, f"label-{label_mode}/eval_loss": eval_loss}, step=global_step)

        # Save checkpoint
        if save_every and epoch % save_every == 0 or eval_loss < min_eval_loss:
            checkpoint = {
                "model_state_dict": model.module.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "global_step": global_step,
                "config": config,
                "eval_loss": eval_loss,
            }
            if eval_loss < min_eval_loss:
                logger.info(f"New best evaluation loss: prev {min_eval_loss:.4f} > new {eval_loss:.4f}")
                min_eval_loss = eval_loss
                model_file = run_dir / "ckpt_best.pth"
            else:
                model_file = run_dir / f"ckpt_ep{epoch:04d}.pth"
            logger.info(f"Saving checkpoint to {model_file}...")
            torch.save(checkpoint, str(model_file.resolve()))
            wandb.save(str(model_file.resolve()))
            logger.info("Done.")

    if auto_test:
        best_ckpt = run_dir / "ckpt_best.pth"
        test(
            str(best_ckpt.resolve()),
            CSNJS_TEST_FILEPATH,
            spm_filepath,
            program_mode,
            label_mode,
            num_workers,
            -1,
            n_decoder_layers=n_decoder_layers,
        )
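
# A toy contrast of the two loss_type settings used in the training loop above
# ("nll_token" vs "nll_sequence"); the shapes and pad_id=0 are made up for illustration.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 7, 100)                     # [B, T, vocab]
targets = torch.randint(1, 100, (2, 7))             # [B, T]
nll_token = F.cross_entropy(logits.transpose(1, 2), targets, ignore_index=0)     # mean over tokens
nll_sequence = F.cross_entropy(logits.transpose(1, 2), targets, ignore_index=0,
                               reduction="sum") / logits.size(0)                 # summed, then per sequence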
Example #13
def test(
    checkpoint_file: str,
    test_filepath: str = CSNJS_TEST_FILEPATH,
    spm_filepath: str = SPM_UNIGRAM_FILEPATH,
    program_mode="identity",
    label_mode="identifier",
    num_workers=1,
    limit_dataset_size=-1,
    batch_size=8,
    model_type="transformer",
    n_decoder_layers=4,
    d_model=512,
    use_cuda: bool = True,
):
    wandb.init(name=checkpoint_file, config=locals(), project="f1_eval", entity="ml4code")
    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"
    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)

    # Create test dataset and dataloader
    logger.info(f"Test data path {test_filepath}")
    test_dataset = get_csnjs_dataset(test_filepath, label_mode=label_mode, limit_size=limit_dataset_size)
    logger.info(f"Test dataset size: {len(test_dataset)}")
    test_loader = javascript_dataloader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        sp=sp,
        program_mode=program_mode,
        subword_regularization_alpha=0,
        augmentations=[],
    )

    pad_id = sp.PieceToId("[PAD]")
    if model_type == "transformer":
        model = TransformerModel(n_tokens=sp.GetPieceSize(), pad_id=pad_id, n_decoder_layers=n_decoder_layers, d_model=d_model)
        logger.info(f"Created TransformerModel with {count_parameters(model)} params")
    elif model_type == "lstm":
        model = Seq2SeqLSTM(n_tokens=sp.GetPieceSize(), pad_id=pad_id, d_model=d_model)
        logger.info(f"Created Seq2SeqLSTM with {count_parameters(model)} params")
    if use_cuda:
        model = model.cuda()

    # Load checkpoint
    checkpoint = torch.load(checkpoint_file)
    pretrained_state_dict = checkpoint["model_state_dict"]
    print("CHECKPOINT", checkpoint_file)
    print("KEYS", list(checkpoint["model_state_dict"].keys()))
    try:
        model.load_state_dict(pretrained_state_dict)
    except RuntimeError as e:
        logger.error(e)
        logger.error("Keys in checkpoint: " + str(list(pretrained_state_dict.keys())))
        raise e
    logger.info(f"Loaded state dict from {checkpoint_file}")

    # Evaluate NLL
    model.eval()
    with torch.no_grad():
        test_nll = calculate_nll(model, test_loader, sp, use_cuda=use_cuda, logger_fn=wandb.log)
    logger.info(f"NLL: {test_nll:.5f}")

    # Make metric
    metric = F1MetricMethodName()
    model.eval()
    with torch.no_grad():
        precision, recall, f1, sample_generations = calculate_f1_metric(metric, model, test_loader, sp, use_cuda=use_cuda, logger_fn=wandb.log)
    logger.info(f"NLL: {test_nll:.5f}")
    logger.info(f"Precision: {precision:.5f}%")
    logger.info(f"Recall: {recall:.5f}%")
    logger.info(f"F1: {f1:.5f}%")

    df_generations = pd.DataFrame(sample_generations)
    df_generations.to_pickle(os.path.join(wandb.run.dir, "sample_generations.pickle.gz"))
    wandb.save(os.path.join(wandb.run.dir, "sample_generations.pickle.gz"))
Example #14
class Transformer:
    def __init__(self, args):
        # real Transformer model architecture
        self.transformer_model = TransformerModel(
            args=args,
            transformer_dropout=0.05,
            embedding_dropout=0.05,
            use_same_embedding=False,
        )
        self.args = args
        exp_name = args.data_set + '_' + args.exp_name

        # create experiment dir
        self.exp_dir = os.path.join(args.checkpoints_dir, exp_name)
        helper_fn.makedirs(self.exp_dir)
        hist_name = exp_name + '.hist'
        model_name = exp_name + '_final_model.h5'

        self.history_path = os.path.join(self.exp_dir, hist_name)
        self.model_path = os.path.join(self.exp_dir, model_name)

        outputs_dir = args.outputs_dir
        helper_fn.makedirs(outputs_dir)
        self.src_out_name = exp_name + '.src'
        self.src_out_path = os.path.join(outputs_dir, self.src_out_name)
        self.pred_out_name = exp_name + '.pred'
        self.pred_out_path = os.path.join(outputs_dir, self.pred_out_name)
        self.tar_out_name = exp_name + '.tgt'
        self.tar_out_path = os.path.join(outputs_dir, self.tar_out_name)

    def train(self):
        ds = DataSet(self.args)
        print('*' * 100)
        print('train sample number: ', ds.train_sample_num)
        print('valid sample number: ', ds.valid_sample_num)
        print('test sample number: ', ds.test_sample_num)
        print('*' * 100)

        train_generator = ds.data_generator(
            'train',
            'transformer',
            max_src_len=self.args.src_seq_length,
            max_tar_len=self.args.tar_seq_length,
        )

        valid_generator = ds.data_generator(
            'valid',
            'transformer',
            max_src_len=self.args.src_seq_length,
            max_tar_len=self.args.tar_seq_length,
        )

        def compile_new_model():
            _model = self.transformer_model.get_model(ds.pad_id)
            _model.compile(
                optimizer=keras.optimizers.Adam(lr=self.args.lr),
                loss=keras.losses.sparse_categorical_crossentropy,
            )
            return _model

        if os.path.exists(self.model_path):
            print('Loading model from: %s' % self.model_path)
            custom_dict = get_custom_objects()
            model = load_model(self.model_path, custom_objects=custom_dict)
        else:
            print('Compile new model...')
            model = compile_new_model()

        #model.summary()
        #plot_model(model, to_file='model_structure.png',show_shapes=True)

        verbose = 1
        earlystopper = EarlyStopping(monitor='val_loss',
                                     patience=self.args.early_stop_patience,
                                     verbose=verbose)
        ckpt_name = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
        ckpt_path = os.path.join(self.exp_dir, ckpt_name)
        checkpoint = ModelCheckpoint(ckpt_path,
                                     monitor='val_loss',
                                     verbose=verbose,
                                     save_best_only=True,
                                     mode='min')
        lrate = keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=self.args.lr_decay_patience,
            verbose=verbose,
            mode='auto',
            min_delta=0.0001,
            cooldown=0,
            min_lr=self.args.lr_min,
        )

        callback_list = [earlystopper, checkpoint, lrate]

        hist = model.fit_generator(
            generator=train_generator,
            steps_per_epoch=(ds.train_sample_num // self.args.batch_size),
            epochs=self.args.epochs,
            callbacks=callback_list,
            validation_data=valid_generator,
            validation_steps=(ds.valid_sample_num // self.args.batch_size),
        )
        with open(self.history_path, 'w') as f:
            f.write(str(hist.history))

        model.save(self.model_path)
        #plot_model(model, to_file='model_structure.png',show_shapes=True)

    def test(self):
        # load_model
        print('Loading model from: %s' % self.model_path)
        custom_dict = get_custom_objects()
        model = load_model(self.model_path, custom_objects=custom_dict)

        ds = DataSet(self.args)
        test_generator = ds.data_generator(
            'test',
            'transformer',
            max_src_len=self.args.src_seq_length,
            max_tar_len=self.args.tar_seq_length,
        )

        src_outobj = open(self.src_out_path, 'w')
        pred_outobj = open(self.pred_out_path, 'w')
        tar_outobj = open(self.tar_out_path, 'w')

        for batch, ([src_input,
                     tar_input], tar_loss_input) in enumerate(test_generator):
            if batch > (ds.test_sample_num // self.args.batch_size):
                # finish all of the prediction
                break
            print('Current batch: {}/{}. '.format(
                batch, ds.test_sample_num // self.args.batch_size))
            cur_batch_size = tar_input.shape[0]
            tar_length = tar_input.shape[1]

            results = np.zeros_like(tar_input)
            results[:, 0] = ds.start_id
            for i in range(1, tar_length):
                results[:, i] = ds.pad_id

            for t in range(1, tar_length):
                preds = model.predict([
                    src_input, np.asarray(results)
                ])  # shape: (batch_size, tar_length, vocab_size)
                pred_id = np.argmax(preds, axis=-1)
                results[:, t] = pred_id[:, t - 1]

            def output_results(outputs, outobj):
                for result in outputs:
                    seq = []
                    for _id in result:
                        _id = int(_id)
                        if _id == ds.end_id:
                            break
                        if _id != ds.pad_id and _id != ds.start_id:
                            seq.append(
                                ds.tar_id_tokens.get(_id, config.UNK_TOKEN))
                    write_line = ' '.join(seq)
                    write_line = write_line + '\n'
                    outobj.write(write_line)

            output_results(results, pred_outobj)
            output_results(src_input, src_outobj)
            output_results(tar_input, tar_outobj)

        src_outobj.close()
        pred_outobj.close()
        tar_outobj.close()
Example #15
def main(args):
    device = torch.device(args.device)

    # Seed
    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Build the models
    backbone_model = BackboneModel(hidden_dim=args.hidden_dim,
                                   arch=args.backbone)
    transformer_model = TransformerModel(d_model=args.hidden_dim,
                                         n_head=args.nheads,
                                         num_encoder_layers=args.enc_layers,
                                         num_decoder_layers=args.dec_layers,
                                         dim_feedforward=args.dim_feedforward,
                                         dropout=args.dropout,
                                         activation="relu",
                                         normalize_before=False)
    model = TraMapModel(backbone_model, transformer_model)
    print("DEVICE:", device)
    backbone_model.to(device)
    transformer_model.to(device)
    model.to(device)

    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]

    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # Data loader
    transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.1888, 0.2168, 0.2469],
                    std=[0.3322, 0.2871, 0.2899])
    ])

    dataset_train = MapQueryDataset(transforms=transforms, split='train')
    sampler_train = torch.utils.data.RandomSampler(dataset_train)
    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=False)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats = None

    # Criterion / Loss function
    # criterion = MSLELoss()
    # criterion = nn.MSELoss()
    # criterion = nn.L1Loss()
    criterion = nn.SmoothL1Loss()
    criterion.to(device)

    # Logger thing
    MB = 1024.0 * 1024.0
    print_every = 10

    target = data_loader_train
    print("Start Training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        model.train()
        criterion.train()
        print("EPOCH:", epoch)

        i = 0
        ## Training process ##
        # Move to GPU or CPU
        for sample, query, duration in data_returner(data_loader_train):

            query = query.to(device)
            sample = sample.to(device)
            ## Target duration
            duration = duration.to(device)
            duration = duration.float()
            outputs = model(sample, query)
            outputs = outputs.flatten()
            # RMSE if criterion set to MSE
            # loss = torch.sqrt(criterion(outputs, duration) + 1e-8)
            # Else
            loss = criterion(outputs, duration)

            loss_value = loss.item()
            if not math.isfinite(loss_value):
                print(
                    "Loss is {}, stop the training process".format(loss_value))
                sys.exit(1)

            optimizer.zero_grad()
            loss.backward()
            if args.clip_max_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.clip_max_norm)

            optimizer.step()

            if i % print_every == 0:
                # print("Output: {} Target: {}".format(outputs.tolist()[0], duration.tolist()[0]))
                if torch.cuda.is_available():
                    print("Iter: {} Memory: {:d}MB Loss: {}".format(
                        i, math.trunc(torch.cuda.max_memory_allocated() / MB),
                        loss_value))
                    # print(outputs[0].item(), duration[0].item())
                else:
                    print("Iter: {} Loss:{}".format(i, loss_value))
            i += 1
        lr_scheduler.step()
        ## Saving or Not saving, there is no in between
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                torch.save(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #16
# Config dict and model are for reference
config_dict = LoadConfig('conf').load_config()

# Load Data
dataset_name = config_dict['dataset_name']
data_creator = CreateData(config_path='conf')
train_datasets, valid_datasets, test_datasets = data_creator.create_all()

# Define Model
model = TransformerModel(
    encoder_vocab_size=data_creator.tokenizer.lang_one_vocab_size,
    decoder_vocab_size=data_creator.tokenizer.lang_two_vocab_size,
    encoder_max_pos=config_dict['max_pos_length'],
    decoder_max_pos=config_dict['max_pos_length'],
    num_heads=config_dict['num_heads'],
    model_dim=config_dict['model_dim'],
    feed_forward_dim=config_dict['feed_forward_dim'],
    dropout_rate=config_dict['dropout_rate'],
    mha_concat_query=config_dict['mha_concat_query'],
    n_layers=config_dict['n_layers'],
    debug=config_dict['debug'])

# Learning Rate Schedule
model_learning_rate = CustomSchedule(config_dict['model_dim'])
model_optimizer = tf.keras.optimizers.Adam(learning_rate=model_learning_rate,
                                           beta_1=0.9,
                                           beta_2=0.98,
                                           epsilon=1e-9)
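
# CustomSchedule is not defined in this snippet; it is assumed to be the inverse-square-root
# warmup schedule from "Attention Is All You Need" (as in the TensorFlow transformer
# tutorial). A sketch under that assumption:
import tensorflow as tf

class NoamSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # lr = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(
            tf.math.rsqrt(step), step * (self.warmup_steps ** -1.5))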

# Loss Object
#  If reduction is NONE, this has shape [batch_size, d0, .. dN-1];
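
# The comment above quotes the behavior of a loss built with reduction=NONE; the usual
# follow-up in this style of code is a manually masked, per-token loss. A sketch of that
# pattern (assumed, not shown in the original; the pad id is taken to be 0):
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))   # mask out padding positions
    loss_ = loss_object(real, pred)                       # shape [batch_size, seq_len]
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)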