import sys

import torch

from datasets.conll2003 import Conll2003Dataset
from models.bilstm_crf.bilstm_crf import BilstmCRF

if __name__ == "__main__":

    # Experiment script: train a BiLSTM-CRF on CoNLL-2003 NER with a small
    # labeled / large unlabeled split (active-learning style setup).
    # NOTE(review): `torch` is referenced here but no `import torch` is visible
    # in this fragment — confirm it is imported at the top of the file.
    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
    batch_size = 200
    print(f"Device: {device}")
    models_dir = 'experiments/models/saves/'

    # ----------------------------------- Prepare dataset ------------------------------------

    TASK_TYPE = 'NER'
    LABELED_UNLABELED_RATIO = 0.1  # fraction of the training set kept as labeled
    ID_sim_beta = 1
    # Load the CoNLL-2003 dataset for the NER task; presumably the pickle path
    # acts as a cache of the preprocessed data — verify in Conll2003Dataset.
    dataset = Conll2003Dataset(
        save_file_path='datasets/saves/conll2003NER.pkl', task='NER')

    # Split into train/test, then carve the training portion into a labeled
    # pool and an unlabeled pool at LABELED_UNLABELED_RATIO.
    train_dataset, test_dataset = dataset.train_test_split()
    ltrain_dataset, utrain_dataset = train_dataset.split(
        LABELED_UNLABELED_RATIO)
    # This fragment uses precomputed embeddings (`x_embeddings`) as features,
    # unlike the POS fragment below which uses raw `x`.
    labeled_train_x, labeled_train_y = ltrain_dataset.x_embeddings, ltrain_dataset.y
    unlabeled_train_x, unlabeled_train_y = utrain_dataset.x_embeddings, utrain_dataset.y
    test_dataset_x, test_dataset_y = test_dataset.x_embeddings, test_dataset.y
    print(f"labelled size: {len(labeled_train_y)}",
          f"Unlabelled size: {len(unlabeled_train_y)}",
          f"test size: {len(test_dataset_y)})")
    # ------------------------------------ create model ---------------------------------------
    dropout_rate = 0.5
    model = BilstmCRF(dataset.max_word_idx + 1,
                      dataset.max_tag_idx + 1,
                      300,
# =================================== Example #2 ===================================
from active_learning.utils import active_learning_loop_limited_tokens

from datasets.conll2003 import Conll2003Dataset
from models.supervised.supervised_mc import SupervisedModelAL

if __name__ == "__main__":

    # Experiment script: active learning with a supervised MC-dropout model on
    # CoNLL-2003; the task (NER/POS/...) can be selected from the command line.
    # NOTE(review): `torch` and `sys` are referenced here but no imports for
    # them are visible in this fragment — confirm they are imported file-wide.
    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
    print(f"Device: {device}")
    models_dir = 'experiments/models/saves/'

    # ----------------------------------- Prepare dataset ------------------------------------
    # First CLI argument overrides the task; defaults to NER when none is given.
    TASK_TYPE = 'NER' if len(sys.argv) < 2 else sys.argv[1]
    LABELED_UNLABELED_RATIO = 0.1  # fraction of the training set kept as labeled
    ID_sim_beta = 1
    # Load the dataset and split into train/test, then carve the training
    # portion into labeled and unlabeled pools at LABELED_UNLABELED_RATIO.
    train_dataset, test_dataset = Conll2003Dataset(
        task=TASK_TYPE).train_test_split()
    ltrain_dataset, utrain_dataset = train_dataset.split(
        LABELED_UNLABELED_RATIO)
    # Precomputed embeddings are used as features here.
    labeled_train_x, labeled_train_y = ltrain_dataset.x_embeddings, ltrain_dataset.y
    unlabeled_train_x, unlabeled_train_y = utrain_dataset.x_embeddings, utrain_dataset.y
    test_dataset_x, test_dataset_y = test_dataset.x_embeddings, test_dataset.y

    print(f"labelled size: {len(labeled_train_y)}",
          f"Unlabelled size: {len(unlabeled_train_y)}",
          f"test size: {len(test_dataset_y)})")
    # ------------------------------------ create model ---------------------------------------
    dropout_rate = 0.4
    # SupervisedModelAL(num_tags, hidden_size?, num_layers?, dropout, dropout)
    # — argument meanings presumed from values; verify against the class.
    model = SupervisedModelAL(train_dataset.max_tag_idx + 1, 100, 2,
                              dropout_rate, dropout_rate)

    if device.type == 'cuda':
from datasets.conll2003 import Conll2003Dataset
from models.bilstm_crf.bilstm_crf import BilstmCRF

if __name__ == "__main__":

    # Experiment script: train a BiLSTM-CRF on CoNLL-2003 POS tagging with a
    # small labeled / large unlabeled split (active-learning style setup).
    # NOTE(review): `torch` is referenced here but no `import torch` is visible
    # in this fragment — confirm it is imported at the top of the file.
    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
    batch_size = 200
    print(f"Device: {device}")
    models_dir = 'experiments/models/saves/'

    # ----------------------------------- Prepare dataset ------------------------------------

    TASK_TYPE = 'POS'
    LABELED_UNLABELED_RATIO = 0.1  # fraction of the training set kept as labeled
    ID_sim_beta = 1
    # Load the CoNLL-2003 dataset for the POS task; presumably the pickle path
    # acts as a cache of the preprocessed data — verify in Conll2003Dataset.
    dataset = Conll2003Dataset(
        save_file_path='datasets/saves/conll2003POS.pkl', task='POS')

    # Split into train/test, then carve the training portion into a labeled
    # pool and an unlabeled pool at LABELED_UNLABELED_RATIO.
    train_dataset, test_dataset = dataset.train_test_split()
    ltrain_dataset, utrain_dataset = train_dataset.split(
        LABELED_UNLABELED_RATIO)
    # This fragment uses raw token indices (`x`) as features, unlike the NER
    # fragment above which uses `x_embeddings`.
    labeled_train_x, labeled_train_y = ltrain_dataset.x, ltrain_dataset.y
    unlabeled_train_x, unlabeled_train_y = utrain_dataset.x, utrain_dataset.y
    test_dataset_x, test_dataset_y = test_dataset.x, test_dataset.y
    print(f"labelled size: {len(labeled_train_y)}",
          f"Unlabelled size: {len(unlabeled_train_y)}",
          f"test size: {len(test_dataset_y)})")
    # ------------------------------------ create model ---------------------------------------
    dropout_rate = 0.5
    model = BilstmCRF(dataset.max_word_idx + 1,
                      dataset.max_tag_idx + 1,
                      30,