import joblib
import numpy as np
import torch

from sklearn import model_selection
from transformers import AdamW, get_linear_schedule_with_warmup

# Project-local modules (config, dataset, engine, utils, model) are
# assumed to sit alongside this script; `model` is assumed to define
# EntityModel.
import config
import dataset
import engine
import utils
from model import EntityModel


def run():
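    """Train the joint POS / entity tagger and save the best checkpoint."""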
    sentences, pos, tag, enc_pos, enc_tag = utils.process_data(config.DATA_FILE)

    meta_data = {
        "enc_pos": enc_pos,
        "enc_tag": enc_tag
    }

    joblib.dump(meta_data, "meta.bin")
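    # meta.bin lets downstream code decode predicted class indices back
    # into POS / entity-tag strings.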

    # Class counts for the two classification heads of EntityModel.
    num_pos = len(enc_pos.classes_)
    num_tag = len(enc_tag.classes_)

    # Hold out 10% of the data for validation; a fixed random_state
    # keeps the split reproducible.
    (
        train_sentences,
        test_sentences,
        train_pos,
        test_pos,
        train_tag,
        test_tag,
    ) = model_selection.train_test_split(
        sentences, pos, tag, random_state=42, test_size=0.1
    )

    train_dataset = dataset.EntityDataset(
        texts=train_sentences, pos=train_pos, tags=train_tag
    )

    test_dataset = dataset.EntityDataset(
        texts=test_sentences, pos=test_pos, tags=test_tag
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda")

    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)

    # Standard BERT fine-tuning practice: apply weight decay to all
    # parameters except biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_param = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # Total optimizer steps across all epochs, used by the linear
    # learning-rate scheduler (no warmup).
    num_train_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_param, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_loss = np.inf

    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = engine.eval_fn(test_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        if test_loss < best_loss:
            torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
            best_loss = test_loss
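

# Entry point when the script is executed directly.
if __name__ == "__main__":
    run()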