Beispiel #1
0
    def test_early_stop_histories_are_unequal(self):
        """ Should return False, because both EarlyStopHistories contain unequal indices. """
        net = Linear(1, 1)
        history1, history2 = EarlyStopHistory(), EarlyStopHistory()
        history1.setup(1)
        history2.setup(1)
        history1.state_dicts[0], history2.state_dicts[0] = net.state_dict(
        ), net.state_dict()
        history1.indices[0], history2.indices[0] = 3, 42

        self.assertIs(EarlyStopHistory.__eq__(history1, history2), False)
def train(data_loader):
    net = Linear(3, 1)
    lr = 1e-3
    optimizer = AdamW(net.parameters(), lr)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
    max_epochs = 40
    for ep_id in range(max_epochs):
        net.train()
        for b_id, batch in enumerate(data_loader):
            optimizer.zero_grad()
            output = net(batch['data'].to(data_loader))
            CEloss = CrossEntropyLoss()
            loss = CEloss(output, batch['ground_truth'])
            writer = SummaryWriter(
                log_dir=
                'C:\Users\andre\OneDrive\Рабочий стол\train\checkpoints',
                comment="Batch loss")
            writer.add_graph(loss)
            loss.backward()
            optimiser.step()
        scheduler.step()
        torch.save(
            {
                'epoch': ep_id,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, '/checkpoints/checkpoints.txt')
def test_pretrainer(tmpdir, device):
    import torch
    from torch.nn import Linear

    # save a model in tmpdir/original/model.ckpt
    first_model = Linear(32, 32).to(device)
    pretrained_dir = tmpdir / "original"
    pretrained_dir.mkdir()
    with open(pretrained_dir / "model.ckpt", "wb") as fo:
        torch.save(first_model.state_dict(), fo)

    # Make a new model and Pretrainer
    pretrained_model = Linear(32, 32).to(device)
    assert not torch.all(torch.eq(pretrained_model.weight, first_model.weight))
    from speechbrain.utils.parameter_transfer import Pretrainer

    pt = Pretrainer(collect_in=tmpdir / "reused",
                    loadables={"model": pretrained_model})
    pt.collect_files(default_source=pretrained_dir)
    pt.load_collected()
    assert torch.all(torch.eq(pretrained_model.weight, first_model.weight))
Beispiel #4
0
 def from_module(cls, src: nn.Linear) -> 'Linear':
     with_bias = src.bias is not None
     new_lin = Linear(src.in_features, src.out_features, with_bias)
     new_lin.load_state_dict(src.state_dict())
     return new_lin
        maint_logits = torch.stack([torch.sum(maint_logits_daily[slice]) \
                        for slice in maint_hist_slices])
        logits = mission_logits + maint_logits

        bce = loss_fun(logits, y)
        bce = loss_fun(mission_logits, y)
        train_losses.append(bce.cpu().detach().numpy())

    n_seen += 1
    time_elapsed += dt.now() - start_time

    update_str = 'Elapsed Time: {0}, Batches Seen: {1}/{2}, Avg. Loss: {3:.4f}'
    elapsed_time_str = ''.join(str(time_elapsed).split('.')[0])
    format_args = (elapsed_time_str, n_seen, len(train_loader),
                   np.mean(train_losses[-5:]))
    print(update_str.format(*format_args))

    if not n_seen % val_every:
        val_acc = evaluate(mission_model, maint_model, val_loader, device)
        val_accs.append(val_acc)

    if not n_seen % save_every:
        ckpt = dict(mission_state_dict=mission_model.state_dict(),
                    maint_state_dict=maint_model.state_dict(),
                    train_losses=train_losses,
                    val_accs=val_accs,
                    time_elapsed=time_elapsed)
        torch.save(ckpt, save_path)

print('\nTraining complete!\n')
"""
A Simple Linear model using Linear module in pytorch
"""
import torch
from torch.nn import Linear
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# set random seed
torch.manual_seed(1)

lr = Linear(in_features=2, out_features=1, bias=True)
print("Parameter w and b:", list(lr.parameters()))

# A method state_dict() Returns a Python dictionary
# object corresponding to the layers of each parameter tensor

print("Layers of each parameter:", lr.state_dict())

# make prediction now for the linear model
x = torch.tensor([[1.0, 2.0], [2.0, 2.0]])
yhat = lr(x)
print("The prediction:", yhat)
print(x.data)
Beispiel #7
0
class LR(nn.Module):  #LR is defined as a child class of nn.Module
    def __init__(self, in_size, out_size):  #Constructor
        #Inheriting methods from parent class nn.Module
        super(LR, self).__init__()
        self.linear = nn.Linear(in_size, out_size)

    def forward(self, x):  #Prediction funciton
        out = self.linear(x)
        return out


model = LR(1, 1)

#We use state_dict to initialize the weight and bias of the model.
#State_dict is a python dictionary. It can be used to map the relationship of linear layers to their parameters.
model.state_dict()['linear.weight'].data[0] = torch.tensor(
    [0.5153])  #Initializing weight
model.state_dict()['linear.bias'].data[0] = torch.tensor(
    [-0.4414])  #initializing bias
print("The Parameters: ",
      list(model.parameters()))  #Lets see the initialized weight and bias
#The above function works because the class LR, is a subclass of Modules.

#Lets use the custom model to make predictions
x = torch.tensor([[1.0], [2.0]])  #2D tensor
print("The shape of x: ", x.shape)
yhat = model(x)
print(yhat)

x = torch.tensor([[1.0], [2.0], [3.0]])
print("The shape of x: ", x.shape)
Beispiel #8
0
def forward(x):
    yhat = w * x + b
    return yhat


x = torch.tensor([[1.0], [2.0], [3.0]])
yhat = forward(x)
print("The Prediction: ", yhat)
print("Y size", yhat.shape)

torch.manual_seed(1)

lr = Linear(in_features=1, out_features=1, bias=True)
print("Parameters w and b: ", list(lr.parameters()))

print("Python Dictionary", lr.state_dict())
print("keys:", lr.state_dict().keys())
print("values:", lr.state_dict().values())

print("weight:", lr.weight)
print("bias:", lr.bias)

x = torch.tensor([[1.0]])
yhat = lr(x)
print("The prediction: ", yhat)

x = torch.tensor([[1.0], [2.0]])
yhat = lr(x)
print("The prediction: ", yhat)

x = torch.tensor([[1.0], [2.0], [3.0]])
Beispiel #9
0
def main():
    cmd_args = add_argument()

    path_to_file_tr = cmd_args.path_to_file_tr
    path_to_file_ts = cmd_args.path_to_file_ts

    min_len_mol = cmd_args.min_len_mol
    max_len_mol = cmd_args.max_len_mol

    num_examples_tr = cmd_args.num_examples_tr
    num_examples_ts = cmd_args.num_examples_ts

    train_batch_size = json.load(open(cmd_args.ds_conf))['train_batch_size']
    gradient_accumulation_steps = json.load(open(
        cmd_args.ds_conf))['gradient_accumulation_steps']

    deepspeed_optimizer = True if json.load(open(cmd_args.ds_conf)).get(
        'optimizer', None) is not None else False

    epochs = cmd_args.epochs
    emb_dim = cmd_args.emb_dim
    dim = cmd_args.dim
    bucket_size = cmd_args.bucket_size
    depth = cmd_args.depth
    heads = cmd_args.heads
    n_hashes = cmd_args.n_hashes
    ff_chunks = cmd_args.ff_chunks
    attn_chunks = cmd_args.attn_chunks
    validate_every = cmd_args.validate_every
    save_every = cmd_args.save_every
    output_folder = cmd_args.output_folder

    use_full_attn = cmd_args.use_full_attn
    mrpc_test = cmd_args.mrpc_test
    use_deepspeed = cmd_args.use_deepspeed

    os.makedirs(output_folder, exist_ok=True)

    pickle.dump(cmd_args,
                open(os.sep.join([output_folder, 'training_conf.pkl']), 'wb'))

    MIN_LENGTH_MOL = min_len_mol
    MAX_LENGTH_MOL = max_len_mol  # 2048
    NUM_EXAMPLES_TR = num_examples_tr  # 1024
    NUM_EXAMPLES_TS = num_examples_ts  # 1024
    N_EPOCHS = epochs  # 10
    VALIDATE_EVERY = validate_every
    SAVE_EVERY = save_every

    MOL_SEQ_LEN = MAX_LENGTH_MOL  # output_lang.max_len if (output_lang.max_len % 2) == 0  else output_lang.max_len + 1 # ??

    saved_mol_lang = os.sep.join([output_folder, 'mol_lang.pkl'])

    MAX_LENGTH_MOL = cmd_args.max_len_mol

    saved_target_lang = os.sep.join([output_folder, 'mol_lang.pkl'])

    if mrpc_test:
        mol_lang, tr_samples, ts_samples = readMRPC(
            molecule_file_tr=path_to_file_tr,
            molecule_file_ts=path_to_file_ts,
            saved_molecule_lang=saved_target_lang,
            num_examples_tr=NUM_EXAMPLES_TR,
            num_examples_ts=NUM_EXAMPLES_TS,
            min_len_molecule=MIN_LENGTH_MOL,
            max_len_molecule=MAX_LENGTH_MOL,
            shuffle=True)
    else:
        mol_lang, tr_samples, ts_samples = readMolecules(
            molecule_file_tr=path_to_file_tr,
            molecule_file_ts=path_to_file_ts,
            saved_molecule_lang=saved_target_lang,
            num_examples_tr=NUM_EXAMPLES_TR,
            num_examples_ts=NUM_EXAMPLES_TS,
            min_len_molecule=MIN_LENGTH_MOL,
            max_len_molecule=MAX_LENGTH_MOL,
            shuffle=True)

    pickle.dump(mol_lang, open(saved_mol_lang, 'wb'))

    train_dataset = MolecularSimilarityDataset(
        tr_samples, mol_lang, train_batch_size if device == 'cuda' else 1)
    test_dataset = MolecularSimilarityDataset(
        ts_samples, mol_lang, train_batch_size if device == 'cuda' else 1)

    MAX_SEQ_LEN = MOL_SEQ_LEN * 2
    print('Axial Embedding shape:', compute_axial_position_shape(MAX_SEQ_LEN))
    model = ReformerLM(
        num_tokens=mol_lang.n_words,
        dim=dim,
        bucket_size=bucket_size,
        depth=depth,
        heads=heads,
        n_hashes=n_hashes,
        max_seq_len=MAX_SEQ_LEN,
        ff_chunks=ff_chunks,
        attn_chunks=attn_chunks,
        weight_tie=True,
        weight_tie_embedding=True,
        axial_position_emb=True,
        axial_position_shape=compute_axial_position_shape(MAX_SEQ_LEN),
        axial_position_dims=(dim // 2, dim // 2),
        return_embeddings=True,
        use_full_attn=use_full_attn).to(device)

    linear_regressor = Linear(512, 2).to(device)

    model = TrainingWrapper(model, ignore_index=PAD_IDX,
                            pad_value=PAD_IDX).to(device)

    model_params = filter(lambda p: p.requires_grad, model.parameters())
    linear_params = filter(lambda p: p.requires_grad,
                           linear_regressor.parameters())

    SAVE_DIR = os.sep.join([output_folder, 'saved_model'])
    os.makedirs(SAVE_DIR, exist_ok=True)

    try:
        model_ckp_max = np.max(
            [int(ckp) for ckp in os.listdir(os.sep.join([SAVE_DIR, 'model']))])
    except:
        model_ckp_max = 0

    gpus_mini_batch = (train_batch_size // gradient_accumulation_steps
                       ) // torch.cuda.device_count()
    print('gpus_mini_batch:', gpus_mini_batch,
          'with gradient_accumulation_steps:', gradient_accumulation_steps)
    log_file = open(os.sep.join([output_folder, 'training_log.log']), 'a')
    log_file.write(
        "\n\n\n{}\tStarting new training from chekpoint: EncoderDecoder-{}\n".
        format(datetime.datetime.now(), model_ckp_max))
    log_file.flush()

    if use_deepspeed:
        if deepspeed_optimizer == False:
            print('No DeepSpeed optimizer found. Using RangerLars.')
            model_optimizer = RangerLars(model.parameters())
            linear_optimizer = RangerLars(linear_regressor.parameters())

            model_engine, model_optimizer, trainloader, _ = deepspeed.initialize(
                args=cmd_args,
                model=model,
                optimizer=model_optimizer,
                model_parameters=model_params,
                training_data=train_dataset)

            linear_engine, linear_optimizer, _, _ = deepspeed.initialize(
                args=cmd_args,
                model=linear_regressor,
                optimizer=linear_optimizer,
                model_parameters=linear_params)

        else:
            print('Found optimizer in the DeepSpeed configurations. Using it.')
            model_engine, model_optimizer, trainloader, _ = deepspeed.initialize(
                args=cmd_args,
                model=model,
                model_parameters=model_params,
                training_data=train_dataset)
            linear_engine, linear_optimizer, _, _ = deepspeed.initialize(
                args=cmd_args,
                model=linear_regressor,
                model_parameters=linear_params)

        _, model_client_sd = model_engine.load_checkpoint(
            os.sep.join([SAVE_DIR, 'model']), model_ckp_max)

        testloader = model_engine.deepspeed_io(test_dataset)

        ######TO DO
        for eph in range(epochs):
            print('Starting Epoch: {}'.format(eph))
            for i, pair in enumerate(tqdm(trainloader)):
                tr_step = ((eph * len(trainloader)) + i) + 1

                src = pair[0]
                trg = pair[1]

                pickle.dump(src, open('src.pkl', 'wb'))
                pickle.dump(trg, open('trg.pkl', 'wb'))

                model_engine.train()
                linear_engine.train()
                #enc_dec.train()

                src = src.to(model_engine.local_rank)
                trg = trg.to(linear_engine.local_rank)

                print("Sample:", src)
                print("Target:", trg)
                print("Target Shape:", trg.shape)
                print("len Samples:", len(src))

                ## Need to learn how to use masks correctly
                enc_input_mask = torch.tensor(
                    [[1 if idx != PAD_IDX else 0 for idx in smpl]
                     for smpl in src]).bool().to(model_engine.local_rank)

                # context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in trg]).bool().to(device)
                #################

                enc_keys = model_engine(
                    src, return_loss=False, input_mask=enc_input_mask
                )  #enc_input_mask)#, context_mask=context_mask)
                #loss = enc_dec(src, trg, return_loss = True, enc_input_mask = None)#enc_input_mask)#, context_mask=context_mask)

                print('enc_keys shape', enc_keys.shape)
                #enc_keys_cls = enc_keys[:,0:1,:].to(linear_engine.local_rank)#torch.tensor([s[0] for s in enc_keys]).to(linear_engine.local_rank)
                #print('enc_keys_cls shape', enc_keys_cls.shape)
                preds = torch.softmax(linear_engine(enc_keys),
                                      dim=1).to(linear_engine.local_rank)

                print('preds shape', preds.shape)
                #preds = np.array([r[0] for r in results])
                #print('Pred:', preds.shape)
                loss = F.cross_entropy(preds, trg).to(linear_engine.local_rank)
                loss.backward()

                model_engine.step()
                linear_engine.step()

                print('Training Loss:', loss.item())
                if tr_step % validate_every == 0:
                    val_loss = []
                    for pair in tqdm(
                            testloader
                    ):  #Can't use the testloader or I will mess up with the model assignment and it won't learn during training, need to use normal validation instead of parallel one
                        model_engine.eval()
                        linear_engine.eval()
                        with torch.no_grad():
                            ts_src = pair[0]
                            ts_trg = pair[1]

                            pickle.dump(ts_src, open('ts_src.pkl', 'wb'))
                            pickle.dump(ts_trg, open('ts_trg.pkl', 'wb'))

                            ts_src = ts_src.to(model_engine.local_rank)
                            ts_trg = ts_trg.to(linear_engine.local_rank)

                            #ts_src = torch.tensor(np.array([pair[0].numpy()])).to(device)
                            #ts_trg = torch.tensor(np.array([pair[1].numpy()])).to(device)

                            ## Need to learn how to use masks correctly
                            ts_enc_input_mask = torch.tensor([
                                [1 if idx != PAD_IDX else 0 for idx in smpl]
                                for smpl in ts_src
                            ]).bool().to(model_engine.local_rank)
                            #ts_context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in ts_trg]).bool().to(device)

                            # loss = model_engine(
                            #     ts_src,
                            #     ts_trg,
                            #     return_loss=True,
                            #     enc_input_mask=ts_enc_input_mask
                            # )  #ts_enc_input_mask)#, context_mask=ts_context_mask)
                            # #loss = enc_dec(ts_src, ts_trg, return_loss = True, enc_input_mask = None)

                            ts_enc_keys = model_engine(
                                ts_src,
                                return_loss=False,
                                input_mask=ts_enc_input_mask)
                            ts_pred = torch.softmax(
                                linear_engine(ts_enc_keys),
                                dim=1).to(linear_engine.local_rank)
                            loss = F.cross_entropy(ts_pred, ts_trg).to(
                                linear_engine.local_rank)
                            val_loss.append(loss.item())

                    print(
                        f'\tValidation Loss: AVG: {np.mean(val_loss)}, MEDIAN: {np.median(val_loss)}, STD: {np.std(val_loss)} '
                    )
                    log_file.write(
                        'Step: {}\tTraining Loss:{}\t Validation LOSS: AVG: {}| MEDIAN: {}| STD: {}\n'
                        .format(i, loss.item(), np.mean(val_loss),
                                np.median(val_loss), np.std(val_loss)))
                else:
                    log_file.write('Step: {}\tTraining Loss:{}\n'.format(
                        i, loss.item()))

                log_file.flush()

                if tr_step % save_every == 0:
                    print('\tSaving Checkpoint')
                    model_ckpt_id = str(model_ckp_max + tr_step + 1)
                    model_engine.save_checkpoint(
                        os.sep.join([SAVE_DIR, 'model']), model_ckpt_id)

        log_file.close()
        print('\tSaving Final Checkpoint')
        model_ckpt_id = str(model_ckp_max + tr_step + 1)
        model_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'model']),
                                     model_ckpt_id)
    else:
        #model_optimizer = torch.optim.Adam(model.parameters()) # RangerLars(model.parameters())
        #linear_optimizer = torch.optim.Adam(linear_regressor.parameters())  # RangerLars(linear_regressor.parameters())

        model_optimizer = torch.optim.Adam(
            list(model.parameters()) + list(linear_regressor.parameters())
        )  #RangerLars(list(model.parameters())+list(linear_regressor.parameters())) #

        PATH = os.sep.join(
            [SAVE_DIR, 'model',
             str(model_ckp_max), 'sts_model.pt'])
        if os.path.exists(PATH):
            print('********** Found Checkpoint. Loading:', PATH)
            checkpoint = torch.load(PATH)

            model.load_state_dict(checkpoint['model_state_dict'])
            linear_regressor.load_state_dict(checkpoint['linear_state_dict'])
            model_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        trainloader = DataLoader(train_dataset,
                                 batch_size=train_batch_size,
                                 shuffle=False)
        testloader = DataLoader(test_dataset,
                                batch_size=train_batch_size,
                                shuffle=False)
        ######TO DO
        train_loss_list = []
        for eph in range(epochs):
            print('Starting Epoch: {}'.format(eph))
            for i, pair in enumerate(tqdm(trainloader)):
                tr_step = ((eph * len(trainloader)) + i) + 1

                src = pair[0]
                trg = pair[1]

                pickle.dump(src, open('src.pkl', 'wb'))
                pickle.dump(trg, open('trg.pkl', 'wb'))

                model.train()
                linear_regressor.train()
                #enc_dec.train()

                src = src.to(device)
                trg = trg.to(device)

                #print("Sample:", src)
                #print("Target:", trg)
                #print("Target Shape:", trg.shape)
                #print("len Samples:", len(src))

                ## Need to learn how to use masks correctly
                enc_input_mask = torch.tensor(
                    [[1 if idx != PAD_IDX else 0 for idx in smpl]
                     for smpl in src]).bool().to(device)

                # context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in trg]).bool().to(device)
                #################

                enc_keys = model(
                    src, return_loss=False, input_mask=enc_input_mask
                )  #enc_input_mask)#, context_mask=context_mask)
                #loss = enc_dec(src, trg, return_loss = True, enc_input_mask = None)#enc_input_mask)#, context_mask=context_mask)

                #print('enc_keys shape', enc_keys.shape)
                enc_keys_cls = enc_keys[:, 0, :].to(
                    device
                )  #torch.tensor([s[0] for s in enc_keys]).to(linear_engine.local_rank)
                #print('enc_keys_cls shape', enc_keys_cls.shape)
                preds = torch.softmax(linear_regressor(enc_keys_cls),
                                      dim=1).to(device)

                #print('preds shape', preds.shape)
                #preds = np.array([r[0] for r in results])
                #print('Pred:', preds.shape)
                loss = F.cross_entropy(preds, trg).to(device)
                loss.backward()

                model_optimizer.step()
                #linear_optimizer.step()

                train_loss_list.append(loss.item())
                #print('Training Loss:', loss.item())
                if tr_step % validate_every == 0:
                    val_loss = []
                    ACC_list = []
                    MCC_list = []
                    for pair in tqdm(
                            testloader
                    ):  #Can't use the testloader or I will mess up with the model assignment and it won't learn during training, need to use normal validation instead of parallel one
                        model.eval()
                        linear_regressor.eval()
                        with torch.no_grad():
                            ts_src = pair[0]
                            ts_trg = pair[1]

                            pickle.dump(ts_src, open('ts_src.pkl', 'wb'))
                            pickle.dump(ts_trg, open('ts_trg.pkl', 'wb'))

                            ts_src = ts_src.to(device)
                            ts_trg = ts_trg.to(device)

                            #ts_src = torch.tensor(np.array([pair[0].numpy()])).to(device)
                            #ts_trg = torch.tensor(np.array([pair[1].numpy()])).to(device)

                            ## Need to learn how to use masks correctly
                            ts_enc_input_mask = torch.tensor(
                                [[1 if idx != PAD_IDX else 0 for idx in smpl]
                                 for smpl in ts_src]).bool().to(device)
                            #ts_context_mask = torch.tensor([[1 for idx in smpl if idx != PAD_IDX] for smpl in ts_trg]).bool().to(device)

                            # loss = model_engine(
                            #     ts_src,
                            #     ts_trg,
                            #     return_loss=True,
                            #     enc_input_mask=ts_enc_input_mask
                            # )  #ts_enc_input_mask)#, context_mask=ts_context_mask)
                            # #loss = enc_dec(ts_src, ts_trg, return_loss = True, enc_input_mask = None)

                            ts_enc_keys = model(ts_src,
                                                return_loss=False,
                                                input_mask=ts_enc_input_mask)
                            ts_enc_keys_cls = ts_enc_keys[:, 0, :].to(device)

                            ts_pred = torch.softmax(
                                linear_regressor(ts_enc_keys_cls),
                                dim=1).to(device)

                            loss = F.cross_entropy(ts_pred, ts_trg).to(device)

                            ACC, MCC = compute_simple_metrics(ts_pred, ts_trg)
                            ACC_list.append(ACC)
                            MCC_list.append(MCC)

                            val_loss.append(loss.item())

                    print(
                        f'\Train Loss: LAST: {train_loss_list[-1]}, AVG: {np.mean(train_loss_list)}, MEDIAN: {np.median(train_loss_list)}, STD: {np.std(train_loss_list)} '
                    )
                    print(
                        f'\tValidation Loss: AVG: {np.mean(val_loss)}, MEDIAN: {np.median(val_loss)}, STD: {np.std(val_loss)} '
                    )
                    print(
                        f'\tValidation ACC: AVG: {np.mean(ACC_list)}, MEDIAN: {np.median(ACC_list)}, STD: {np.std(ACC_list)} '
                    )
                    print(
                        f'\tValidation MCC: AVG: {np.mean(MCC_list)}, MEDIAN: {np.median(MCC_list)}, STD: {np.std(MCC_list)} '
                    )
                    log_file.write(
                        'Step: {}\tTraining Loss:{}\t Validation LOSS: AVG: {}| MEDIAN: {}| STD: {}\n'
                        .format(i, loss.item(), np.mean(val_loss),
                                np.median(val_loss), np.std(val_loss)))
                else:
                    log_file.write('Step: {}\tTraining Loss:{}\n'.format(
                        i, loss.item()))

                log_file.flush()

                if tr_step % save_every == 0:
                    print('\tSaving Checkpoint')
                    model_ckpt_id = str(model_ckp_max + tr_step + 1)
                    #model_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'model']),
                    #                            model_ckpt_id)
                    PATH = os.sep.join([
                        SAVE_DIR, 'model',
                        str(model_ckpt_id), 'sts_model.pt'
                    ])
                    os.makedirs(os.sep.join(PATH.split(os.sep)[:-1]),
                                exist_ok=True)
                    torch.save(
                        {
                            'step': tr_step,
                            'model_state_dict': model.state_dict(),
                            'linear_state_dict': linear_regressor.state_dict(),
                            'optimizer_state_dict':
                            model_optimizer.state_dict(),
                        }, PATH)

        log_file.close()
        print('\tSaving Final Checkpoint')
        model_ckpt_id = str(model_ckp_max + tr_step + 1)
        #model_engine.save_checkpoint(os.sep.join([SAVE_DIR, 'model']),
        #                            model_ckpt_id)
        PATH = os.sep.join(
            [SAVE_DIR, 'model',
             str(model_ckpt_id), 'sts_model.pt'])
        os.makedirs(os.sep.join(PATH.split(os.sep)[:-1]), exist_ok=True)
        torch.save(
            {
                'step': tr_step,
                'model_state_dict': model.state_dict(),
                'linear_state_dict': linear_regressor.state_dict(),
                'optimizer_state_dict': model_optimizer.state_dict(),
            }, PATH)