    mask_train = mask[val_size:]
    labels_train = labels[val_size:]

    # Wrap the tensors in DataLoaders to handle batching
    train_ds = TensorDataset(input_ids_train, mask_train, labels_train)
    val_ds = TensorDataset(input_ids_val, mask_val, labels_val)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size)
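    # Note: only the training loader is shuffled; validation batches keep a
    # fixed order so evaluation is deterministic across epochs.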

    # Initialise the grader
    model = BERTGrader()
    model.to(device)
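    # BERTGrader is defined elsewhere in the repo; it is assumed here to wrap
    # a pretrained BERT encoder with a regression head that outputs a single
    # grade per input sequence, trained against the MSE criterion below.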

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)

    # Scheduler: MultiStepLR decays the lr by its default factor of 0.1
    # at the milestone epoch sch
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[sch])

    # Criterion: mean squared error between predicted and reference grades
    criterion = torch.nn.MSELoss(reduction='mean')

    # Train
    for epoch in range(epochs):

        # train for one epoch
        print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
        train(train_dl, model, criterion, optimizer, epoch, device)
        scheduler.step()
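
    # A minimal sketch of the per-epoch train() helper called above, written
    # under the assumption that it follows the standard PyTorch loop and that
    # BERTGrader takes (input_ids, mask); the repo's actual helper may also
    # track a running loss average or clip gradients.
    #
    # def train(train_dl, model, criterion, optimizer, epoch, device):
    #     model.train()
    #     for input_ids, mask, labels in train_dl:
    #         input_ids = input_ids.to(device)
    #         mask = mask.to(device)
    #         labels = labels.to(device)
    #         preds = model(input_ids, mask)   # forward pass through the grader
    #         loss = criterion(preds, labels)  # MSE against reference grades
    #         # (a squeeze() on preds may be needed, depending on output shape)
    #         optimizer.zero_grad()
    #         loss.backward()
    #         optimizer.step()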
    X_val = X[:num_points_val]
    labels_val = labels[:num_points_val]
    X_train = X[num_points_val:]
    labels_train = labels[num_points_val:]

    ds_train = TensorDataset(X_train, labels_train)
    ds_val = TensorDataset(X_val, labels_val)
    dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
    dl_val = DataLoader(ds_val, batch_size=batch_size)

    # Model
    model = LayerClassifier(num_comps)
    model.to(device)
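    # LayerClassifier is defined elsewhere in the repo; it is assumed here to
    # be a small classifier over num_comps input components whose logits feed
    # the cross-entropy criterion below.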

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Criterion: cross-entropy over the classifier logits
    criterion = torch.nn.CrossEntropyLoss().to(device)

    # Create the results file and record the experiment configuration
    N = len(attack_phrase.split())
    with open(out_file, 'w') as f:
        text = f'Head {head_num}, Comps {num_comps}, N {N}\n'
        f.write(text)

    # Train
    for epoch in range(epochs):

        # train for one epoch
        text = '\n current lr {:.5e}'.format(optimizer.param_groups[0]['lr'])