Example #1
def setup_simple_classifier():
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300
    VOCAB_SIZE = 10
    NUM_CLASSES = 3
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    labels = torch.LongTensor([[1]])
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    simple_classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    iter_dict = {"tokens": tokens, "label": labels}
    return iter_dict, simple_classifier, BATCH_SIZE, NUM_CLASSES
Example #2
def setup_zero_embeddings(request):
    EMB_DIM = 300
    VOCAB_SIZE = 10
    BATCH_SIZE = 10
    aggregation_type = request.param[0]
    embedding_type = request.param[1]
    embedding = None
    if embedding_type == "zeros":
        embedding = torch.zeros([VOCAB_SIZE, EMB_DIM])
    elif embedding_type == "ones":
        embedding = torch.ones([VOCAB_SIZE, EMB_DIM])
    embedding = nn.Embedding.from_pretrained(embedding)
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMB_DIM)
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          aggregation_type=aggregation_type)
    tokens = np.random.randint(0, VOCAB_SIZE - 1, size=(BATCH_SIZE, EMB_DIM))
    tokens = torch.LongTensor(tokens)
    iter_dict = {"tokens": tokens}
    options = {
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "BATCH_SIZE": BATCH_SIZE,
        "EMBEDDING_TYPE": embedding_type,
        "AGGREGATION_TYPE": aggregation_type,
    }
    return encoder, iter_dict, options
Example #3
def setup_lstm2seqencoder(request):
    EMBEDDING_DIM = 100
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    NUM_TIME_STEPS = 10
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    tokens = torch.LongTensor(tokens)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )

    return (
        encoder,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "EXPECTED_HIDDEN_DIM": (2 * HIDDEN_DIM
                                    if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
                                    else HIDDEN_DIM),
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_LAYERS": NUM_LAYERS,
        },
    )
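Note: the EXPECTED_HIDDEN_DIM entry above encodes the output-size rule used throughout these examples: a bidirectional LSTM with the "concat" combine strategy doubles the hidden dimension, any other combination keeps it unchanged. A minimal, self-contained sketch of that rule (the helper name is ours, not part of the library):

def expected_hidden_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    # "concat" stacks the forward and backward states, doubling the size;
    # "sum" (or a unidirectional encoder) keeps the hidden size unchanged.
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim

assert expected_hidden_dim(1024, True, "concat") == 2048
assert expected_hidden_dim(1024, True, "sum") == 1024
assert expected_hidden_dim(1024, False, "concat") == 1024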
Example #4
    def test_raises_error_on_wrong_combine_strategy(self,
                                                    setup_lstm2vecencoder):
        with pytest.raises(AssertionError):

            encoder = LSTM2VecEncoder(
                emb_dim=300,
                embedder=VanillaEmbedder(nn.Embedding(10, 1024),
                                         embedding_dim=1024),
                combine_strategy="add",
            )
Example #5
def setup_lstm2vecencoder(request):
    emb_dim = 300
    time_steps = 10
    vocab_size = 100
    batch_size = 32
    embedding = nn.Embedding.from_pretrained(torch.zeros([vocab_size,
                                                          emb_dim]))
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    tokens = np.random.randint(0,
                               vocab_size - 1,
                               size=(batch_size, time_steps))
    tokens = torch.LongTensor(tokens)

    iter_dict = {"tokens": tokens}
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=emb_dim)

    encoder = LSTM2VecEncoder(
        emb_dim=emb_dim,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )

    return (
        encoder,
        {
            "emb_dim": emb_dim,
            "vocab_size": vocab_size,
            "hidden_dim": (2 * hidden_dimension
                           if bidirectional and combine_strategy == "concat"
                           else hidden_dimension),
            "bidirectional": False,
            "combine_strategy": combine_strategy,
            "tokens": tokens,
            "batch_size": batch_size,
            "iter_dict": iter_dict,
        },
    )
Example #6
def get_bow_lc_parsect_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    EMBEDDING_DIMENSION = config["EMBEDDING_DIMENSION"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIMENSION)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIMENSION,
                               embedding=embedding)
    encoder = BOW_Encoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=0.0,
        aggregation_type="sum",
    )

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIMENSION,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    dataset.print_stats()

    parsect_inference = ClassificationInference(model=model,
                                                model_filepath=model_filepath,
                                                dataset=dataset)

    return parsect_inference
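Note: get_bow_lc_parsect_infer reads only four keys out of the stored config.json. A hypothetical config sketch with those keys (values are illustrative placeholders, not taken from a real experiment):

example_config = {
    "EMBEDDING_DIMENSION": 300,  # word embedding size used by the BOW encoder
    "MODEL_SAVE_DIR": "output/bow_lc/checkpoints",  # directory containing best_model.pt
    "VOCAB_SIZE": 3000,  # vocabulary size for nn.Embedding
    "NUM_CLASSES": 23,  # number of target classes for SimpleClassifier
}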
Example #7
def embedder():
    batch_size = 32
    time_steps = 10
    vocab_size = 3000
    embedding_dim = 300

    embedding = nn.Embedding.from_pretrained(
        embeddings=torch.rand(vocab_size, embedding_dim), freeze=False
    )
    tokens = torch.LongTensor(
        torch.randint(0, vocab_size, size=(batch_size, time_steps)))
    options = {
        "tokens": tokens,
        "embedding_size": embedding_dim,
        "batch_size": batch_size,
        "time_steps": time_steps,
    }
    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=embedding_dim)
    return embedder, options
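Note: the embedder fixture above relies on nn.Embedding.from_pretrained with freeze=False so the pretrained weights stay trainable. A standard-PyTorch check of that behaviour, independent of the library:

import torch
import torch.nn as nn

weights = torch.rand(10, 4)
frozen = nn.Embedding.from_pretrained(weights)                  # freeze defaults to True
trainable = nn.Embedding.from_pretrained(weights, freeze=False)
assert frozen.weight.requires_grad is False
assert trainable.weight.requires_grad is True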
Example #8
        idx - 8: classname
        for idx, classname in idx2classnames.items() if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items() if idx in range(16, 24)
    }

    task_constraints = allowed_transitions(constraint_type="BIOUL",
                                           labels=task_idx2classnames)
    process_constraints = allowed_transitions(constraint_type="BIOUL",
                                              labels=process_idx2classnames)
    material_constraints = allowed_transitions(constraint_type="BIOUL",
                                               labels=material_idx2classnames)

    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(embedding=char_embedding,
                                        embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM
Example #9
def get_elmo_bilstm_lc_infer(dirname: str):

    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]

    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1]),
    )

    vanilla_embedder = VanillaEmbedder(embedding=embedding,
                                       embedding_dim=EMBEDDING_DIM)

    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)

    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)
    return inference
Example #10
    def setup_engine_once(
        config_dict: Dict[str, str],
        experiment_name: str,
        train_data_filepath: pathlib.Path,
        test_data_filepath: pathlib.Path,
    ):
        DEBUG = config_dict["DEBUG"]
        DEBUG_DATASET_PROPORTION = config_dict["DEBUG_DATASET_PROPORTION"]
        BATCH_SIZE = config_dict["BATCH_SIZE"]
        LEARNING_RATE = config_dict["LEARNING_RATE"]
        NUM_EPOCHS = config_dict["NUM_EPOCHS"]
        SAVE_EVERY = config_dict["SAVE_EVERY"]
        LOG_TRAIN_METRICS_EVERY = config_dict["LOG_TRAIN_METRICS_EVERY"]
        EMBEDDING_DIMENSION = config_dict["EMBEDDING_DIMENSION"]
        CHAR_EMBEDDING_DIMENSION = config_dict["CHAR_EMBEDDING_DIMENSION"]
        EMBEDDING_TYPE = config_dict["EMBEDDING_TYPE"]
        MAX_NUM_WORDS = config_dict["MAX_NUM_WORDS"]
        MAX_LENGTH = config_dict["MAX_LENGTH"]
        DEVICE = config_dict["DEVICE"]
        HIDDEN_DIM = config_dict["HIDDEN_DIM"]
        BIDIRECTIONAL = config_dict["BIDIRECTIONAL"]
        COMBINE_STRATEGY = config_dict["COMBINE_STRATEGY"]
        MAX_CHAR_LENGTH = config_dict["MAX_CHAR_LENGTH"]
        USE_CHAR_ENCODER = config_dict["USE_CHAR_ENCODER"]
        CHAR_ENCODER_HIDDEN_DIM = config_dict["CHAR_ENCODER_HIDDEN_DIM"]
        DROPOUT = config_dict["DROPOUT"]

        EXP_NAME = experiment_name
        EXP_DIR_PATH = os.path.join(OUTPUT_DIR, EXP_NAME)

        if not os.path.isdir(EXP_DIR_PATH):
            os.mkdir(EXP_DIR_PATH)

        MODEL_SAVE_DIR = os.path.join(EXP_DIR_PATH, "checkpoints")

        if not os.path.isdir(MODEL_SAVE_DIR):
            os.mkdir(MODEL_SAVE_DIR)

        VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "vocab.json")
        CHAR_VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH,
                                                 "char_vocab.json")
        CAPITALIZATION_VOCAB_STORE_LOCATION = os.path.join(
            EXP_DIR_PATH, "capitalization_vocab.json")
        CAPITALIZATION_EMBEDDING_DIMENSION = 10
        TENSORBOARD_LOGDIR = os.path.join(".", "runs", EXP_NAME)

        train_dataset = ParscitDataset(
            filename=str(train_data_filepath),
            dataset_type="train",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        validation_dataset = ParscitDataset(
            filename=str(test_data_filepath),
            dataset_type="valid",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        test_dataset = ParscitDataset(
            filename=str(test_data_filepath),
            dataset_type="test",
            max_num_words=MAX_NUM_WORDS,
            max_instance_length=MAX_LENGTH,
            max_char_length=MAX_CHAR_LENGTH,
            word_vocab_store_location=VOCAB_STORE_LOCATION,
            char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
            captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
            capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
            debug=DEBUG,
            debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
            word_embedding_type=EMBEDDING_TYPE,
            word_embedding_dimension=EMBEDDING_DIMENSION,
            char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
            word_start_token="<SOS>",
            word_end_token="<EOS>",
            word_pad_token="<PAD>",
            word_unk_token="<UNK>",
            word_add_start_end_token=False,
        )

        train_dataset.print_stats()
        validation_dataset.print_stats()
        test_dataset.print_stats()

        VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
        NUM_CLASSES = train_dataset.get_num_classes()
        embedding = train_dataset.word_vocab.load_embedding()
        embedding = nn.Embedding.from_pretrained(embedding, freeze=False)
        char_embedding = train_dataset.char_vocab.load_embedding()
        char_embedding = nn.Embedding.from_pretrained(char_embedding,
                                                      freeze=False)

        embedder = VanillaEmbedder(embedding=embedding,
                                   embedding_dim=EMBEDDING_DIMENSION)

        if USE_CHAR_ENCODER:
            char_embedder = VanillaEmbedder(
                embedding=char_embedding,
                embedding_dim=CHAR_EMBEDDING_DIMENSION)
            char_encoder = CharLSTMEncoder(
                char_emb_dim=CHAR_EMBEDDING_DIMENSION,
                char_embedder=char_embedder,
                bidirectional=True,
                hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
                combine_strategy="concat",
                device=torch.device(DEVICE),
            )
            embedder = ConcatEmbedders([embedder, char_encoder])
            EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

        lstm2seqencoder = Lstm2SeqEncoder(
            emb_dim=EMBEDDING_DIMENSION,
            embedder=embedder,
            dropout_value=DROPOUT,
            hidden_dim=HIDDEN_DIM,
            bidirectional=BIDIRECTIONAL,
            combine_strategy=COMBINE_STRATEGY,
            rnn_bias=True,
            device=torch.device(DEVICE),
        )
        model = ParscitTagger(
            rnn2seqencoder=lstm2seqencoder,
            num_classes=NUM_CLASSES,
            hid_dim=2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        )

        optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
        metric = TokenClassificationAccuracy(
            idx2labelname_mapping=train_dataset.idx2classname)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer, mode="max", factor=0.1, patience=2)

        engine = Engine(
            model=model,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            test_dataset=test_dataset,
            optimizer=optimizer,
            batch_size=BATCH_SIZE,
            save_dir=MODEL_SAVE_DIR,
            num_epochs=NUM_EPOCHS,
            save_every=SAVE_EVERY,
            log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
            tensorboard_logdir=TENSORBOARD_LOGDIR,
            device=torch.device(DEVICE),
            metric=metric,
            track_for_best="macro_fscore",
            lr_scheduler=scheduler,
        )

        config_dict["VOCAB_STORE_LOCATION"] = VOCAB_STORE_LOCATION
        config_dict["CHAR_VOCAB_STORE_LOCATION"] = CHAR_VOCAB_STORE_LOCATION
        config_dict["MODEL_SAVE_DIR"] = MODEL_SAVE_DIR
        config_dict["VOCAB_SIZE"] = VOCAB_SIZE
        config_dict["NUM_CLASSES"] = NUM_CLASSES

        with open(os.path.join(f"{EXP_DIR_PATH}", "config.json"), "w") as fp:
            json.dump(config_dict, fp)

        return engine
Example #11
def get_science_ie_infer(dirname: str):
    model_folder = pathlib.Path(dirname)
    hyperparam_config_filename = model_folder.joinpath("config.json")

    with open(hyperparam_config_filename, "r") as fp:
        config = json.load(fp)

    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    NUM_LAYERS = config.get("NUM_LAYERS", 1)
    DROPOUT = config.get("DROPOUT", 0.0)

    print(f"NUM_LAYERS", NUM_LAYERS)

    test_science_ie_conll_filepath = pathlib.Path(DATA_DIR, "dev_science_ie_conll.txt")

    test_dataset = ScienceIEDataset(
        filename=test_science_ie_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )

    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)

    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {idx: classname for classname, idx in classnames2idx.items()}

    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items()
        if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items()
        if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items()
        if idx in range(16, 24)
    }

    task_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames
    )
    process_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames
    )
    material_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames
    )

    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        num_layers=NUM_LAYERS,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ScienceIETagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIMENSION
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIMENSION,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=torch.device(DEVICE),
    )

    inference_client = ScienceIEInference(
        model=model, model_filepath=str(model_filepath), dataset=test_dataset
    )
    return inference_client
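Note: the three idx2classnames slices above rely on the ScienceIE label layout used in these examples: indices 0-7 are task tags, 8-15 process tags and 16-23 material tags, each remapped to its own 0-7 range before building the per-task CRF constraints. A small standalone sketch of that partitioning (class names here are placeholders):

idx2classnames = {idx: f"tag_{idx}" for idx in range(24)}  # placeholder names
task = {idx: name for idx, name in idx2classnames.items() if idx in range(0, 8)}
process = {idx - 8: name for idx, name in idx2classnames.items() if idx in range(8, 16)}
material = {idx - 16: name for idx, name in idx2classnames.items() if idx in range(16, 24)}
assert list(task) == list(process) == list(material) == list(range(8))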
Example #12
def setup_science_ie_tagger(request):
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    DEVICE = torch.device("cpu")
    NUM_CLASSES = 8
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM]))
    tokens = np.random.randint(0,
                               VOCAB_SIZE,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))

    task_labels = np.random.randint(0, 8, size=(BATCH_SIZE, NUM_TIME_STEPS))
    process_labels = np.random.randint(8,
                                       16,
                                       size=(BATCH_SIZE, NUM_TIME_STEPS))
    material_labels = np.random.randint(16,
                                        24,
                                        size=(BATCH_SIZE, NUM_TIME_STEPS))
    task_labels = torch.LongTensor(task_labels)
    process_labels = torch.LongTensor(process_labels)
    material_labels = torch.LongTensor(material_labels)
    labels = torch.cat([task_labels, process_labels, material_labels], dim=1)

    char_tokens = np.random.randint(0,
                                    CHAR_VOCAB_SIZE - 1,
                                    size=(BATCH_SIZE, NUM_TIME_STEPS,
                                          MAX_CHAR_LENGTH))
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {
        idx: classname
        for classname, idx in classnames2idx.items()
    }
    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items() if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items() if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items() if idx in range(16, 24)
    }

    task_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames)
    process_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames)
    material_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)

    if HAVE_CHARACTER_ENCODER:
        char_embedder = VanillaEmbedder(embedding=CHARACTER_EMBEDDING,
                                        embedding_dim=CHARACTER_EMBEDDING_DIM)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM += 2 * CHARACTER_ENCODER_HIDDEN_DIM

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )

    tagger = ScienceIETagger(
        rnn2seqencoder=encoder,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=DEVICE,
    )

    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": (2 * HIDDEN_DIM
                           if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
                           else HIDDEN_DIM),
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": (2 * HIDDEN_DIM
                                    if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
                                    else HIDDEN_DIM),
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
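Note: in the fixture above the task, process and material label tensors are concatenated along dim=1, so the returned "labels" tensor has shape (BATCH_SIZE, 3 * NUM_TIME_STEPS). A quick standalone shape check with the same sizes (standard PyTorch, not library code):

import torch

task = torch.zeros(2, 10, dtype=torch.long)
process = torch.ones(2, 10, dtype=torch.long)
material = 2 * torch.ones(2, 10, dtype=torch.long)
labels = torch.cat([task, process, material], dim=1)
assert labels.shape == (2, 30)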
Example #13
def vanilla_embedder():
    embedding = nn.Embedding(1000, 100)
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=100)
    return embedder, {"EMBEDDING_DIM": 100, "VOCAB_SIZE": 1000}
Example #14
def setup_parscit_tagger(request):
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    NUM_CLASSES = 5
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM]))
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    labels = np.random.randint(0,
                               NUM_CLASSES - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    char_tokens = np.random.randint(0,
                                    CHAR_VOCAB_SIZE - 1,
                                    size=(BATCH_SIZE, NUM_TIME_STEPS,
                                          MAX_CHAR_LENGTH))
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)
    if HAVE_CHARACTER_ENCODER:
        char_embedder = VanillaEmbedder(embedding=CHARACTER_EMBEDDING,
                                        embedding_dim=CHARACTER_EMBEDDING_DIM)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM = EMBEDDING_DIM + (2 * CHARACTER_ENCODER_HIDDEN_DIM)

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )

    tagger = ParscitTagger(
        rnn2seqencoder=encoder,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
    )

    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": (2 * HIDDEN_DIM
                           if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
                           else HIDDEN_DIM),
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": (2 * HIDDEN_DIM
                                    if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
                                    else HIDDEN_DIM),
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
Example #15
def get_bilstm_crf_infer(dirname: str):
    hyperparam_config_filepath = pathlib.Path(dirname, "config.json")

    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    DROPOUT = config.get("DROPOUT", 0.0)

    test_conll_filepath = pathlib.Path(DATA_DIR, "cora_conll.txt")

    test_dataset = ParscitDataset(
        filename=test_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )

    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)
    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)

    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(embedding=char_embedding,
                                        embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])

        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIMENSION if BIDIRECTIONAL
        and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION,
    )

    inference_client = ParscitInference(model=model,
                                        model_filepath=str(model_filepath),
                                        dataset=test_dataset)
    return inference_client
Example #16
def setup_engine_test_with_simple_classifier(request, tmpdir_factory):
    MAX_NUM_WORDS = 1000
    MAX_LENGTH = 50
    vocab_store_location = tmpdir_factory.mktemp("tempdir").join("vocab.json")
    DEBUG = True
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300

    train_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    validation_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    test_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )

    VOCAB_SIZE = MAX_NUM_WORDS + len(train_dataset.word_vocab.special_vocab)
    NUM_CLASSES = train_dataset.get_num_classes()
    NUM_EPOCHS = 1
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    labels = torch.LongTensor([1])
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname)
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0,
                               VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    engine = Engine(
        model,
        train_dataset,
        validation_dataset,
        test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=tmpdir_factory.mktemp("model_save"),
        num_epochs=NUM_EPOCHS,
        save_every=1,
        log_train_metrics_every=10,
        metric=metric,
        track_for_best=request.param,
    )

    options = {
        "MAX_NUM_WORDS": MAX_NUM_WORDS,
        "MAX_LENGTH": MAX_LENGTH,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_TOKENS": NUM_TOKENS,
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "NUM_CLASSES": NUM_CLASSES,
        "NUM_EPOCHS": NUM_EPOCHS,
    }

    return engine, tokens, labels, options
Example #17
    with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp:
        json.dump(config, fp)

    # load the word embeddings
    embeddings = train_dataset.word_vocab.load_embedding()
    embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False)

    # instantiate the elmo embedder
    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1]),
    )

    # instantiate the vanilla embedder
    vanilla_embedder = VanillaEmbedder(embedding=embeddings,
                                       embedding_dim=EMBEDDING_DIMENSION)

    # concat the embeddings
    embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIMENSION + 1024,
        embedder=embedder,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )

    encoding_dim = (2 * HIDDEN_DIMENSION if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION)
Example #18
    NUM_CLASSES = train_dataset.get_num_classes()

    config["VOCAB_SIZE"] = VOCAB_SIZE
    config["NUM_CLASSES"] = NUM_CLASSES

    with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp:
        json.dump(config, fp)

    with open(os.path.join(EXP_DIR_PATH, "test_dataset_params.json"), "w") as fp:
        json.dump(test_dataset_params, fp)

    random_embeddings = train_dataset.word_vocab.load_embedding()
    random_embeddings = nn.Embedding.from_pretrained(random_embeddings, freeze=False)

    embedder = VanillaEmbedder(
        embedding_dim=EMBEDDING_DIMENSION, embedding=random_embeddings
    )
    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIMENSION,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
        device=torch.device(DEVICE),
    )

    classifier_encoding_dim = (2 * HIDDEN_DIMENSION
                               if BIDIRECTIONAL else HIDDEN_DIMENSION)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,