def setup_simple_classifier():
    """Build a tiny SimpleClassifier over a sum-aggregated BOW encoder.

    The embedding table is all zeros so the forward pass is deterministic.
    Returns a 4-tuple ``(iter_dict, classifier, batch_size, num_classes)``
    where ``iter_dict`` holds one random token batch and a single gold label.
    """
    batch_size, num_tokens = 1, 3
    emb_dim, vocab_size, num_classes = 300, 10, 3

    # Zero-valued pretrained weights -> every token embeds to the zero vector.
    embedder = VanillaEmbedder(
        embedding_dim=emb_dim,
        embedding=Embedding.from_pretrained(torch.zeros([vocab_size, emb_dim])),
    )
    encoder = BOW_Encoder(
        emb_dim=emb_dim,
        embedder=embedder,
        dropout_value=0,
        aggregation_type="sum",
    )
    simple_classifier = SimpleClassifier(
        encoder=encoder,
        encoding_dim=emb_dim,
        num_classes=num_classes,
        classification_layer_bias=False,
    )

    # Random token ids in [0, vocab_size - 2] and one hard-coded label.
    tokens = torch.LongTensor(
        np.random.randint(0, vocab_size - 1, size=(batch_size, num_tokens))
    )
    iter_dict = {"tokens": tokens, "label": torch.LongTensor([[1]])}
    return iter_dict, simple_classifier, batch_size, num_classes
def setup_zero_embeddings(request):
    """Build a BOW_Encoder whose embedding table is constant (zeros or ones).

    ``request.param`` supplies ``(aggregation_type, embedding_type)``.
    Returns ``(encoder, iter_dict, options)`` where ``options`` echoes the
    fixture configuration for the consuming test.
    """
    emb_dim, vocab_size, batch_size = 300, 10, 10
    aggregation_type = request.param[0]
    embedding_type = request.param[1]

    # Select the constant weight matrix requested by the parametrization;
    # an unknown type leaves the weights as None (from_pretrained will fail
    # loudly, same as the original if/elif chain).
    factories = {"zeros": torch.zeros, "ones": torch.ones}
    factory = factories.get(embedding_type)
    weights = factory([vocab_size, emb_dim]) if factory is not None else None
    embedding = nn.Embedding.from_pretrained(weights)

    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=emb_dim)
    encoder = BOW_Encoder(
        emb_dim=emb_dim, embedder=embedder, aggregation_type=aggregation_type
    )

    # NOTE(review): the token batch is sized (batch, EMB_DIM), not
    # (batch, num_tokens) — preserved verbatim; confirm this is intentional.
    tokens = torch.LongTensor(
        np.random.randint(0, vocab_size - 1, size=(batch_size, emb_dim))
    )
    iter_dict = {"tokens": tokens}
    options = {
        "EMB_DIM": emb_dim,
        "VOCAB_SIZE": vocab_size,
        "BATCH_SIZE": batch_size,
        "EMBEDDING_TYPE": embedding_type,
        "AGGREGATION_TYPE": aggregation_type,
    }
    return encoder, iter_dict, options
def setup_lstm2seqencoder(request):
    """Build an Lstm2SeqEncoder over an all-zero embedding table.

    ``request.param`` supplies ``(bidirectional, combine_strategy,
    num_layers)``.  Returns ``(encoder, options)`` with the options dict
    carrying the configuration plus the random token batch.
    """
    emb_dim, vocab_size = 100, 1000
    batch_size, hidden_dim, num_steps = 2, 1024, 10
    bidirectional = request.param[0]
    combine_strategy = request.param[1]
    num_layers = request.param[2]

    zero_table = nn.Embedding.from_pretrained(torch.zeros([vocab_size, emb_dim]))
    token_ids = torch.LongTensor(
        np.random.randint(0, vocab_size - 1, size=(batch_size, num_steps))
    )
    encoder = Lstm2SeqEncoder(
        emb_dim=emb_dim,
        embedder=VanillaEmbedder(embedding=zero_table, embedding_dim=emb_dim),
        dropout_value=0.0,
        hidden_dim=hidden_dim,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
        num_layers=num_layers,
    )

    # Bidirectional + "concat" doubles the per-timestep hidden size.
    expected_hidden = (
        2 * hidden_dim
        if combine_strategy == "concat" and bidirectional
        else hidden_dim
    )
    options = {
        "EMBEDDING_DIM": emb_dim,
        "VOCAB_SIZE": vocab_size,
        "BATCH_SIZE": batch_size,
        "HIDDEN_DIM": hidden_dim,
        "COMBINE_STRATEGY": combine_strategy,
        "BIDIRECTIONAL": bidirectional,
        "tokens": token_ids,
        "EXPECTED_HIDDEN_DIM": expected_hidden,
        "TIME_STEPS": num_steps,
        "NUM_LAYERS": num_layers,
    }
    return encoder, options
def test_raises_error_on_wrong_combine_strategy(self, setup_lstm2vecencoder):
    """An unsupported combine_strategy ("add") must trip the encoder's assertion."""
    with pytest.raises(AssertionError):
        LSTM2VecEncoder(
            emb_dim=300,
            embedder=VanillaEmbedder(nn.Embedding(10, 1024), embedding_dim=1024),
            combine_strategy="add",
        )
def setup_lstm2vecencoder(request):
    """Build an LSTM2VecEncoder over an all-zero embedding table.

    ``request.param`` supplies ``(bidirectional, combine_strategy)``.
    Returns ``(encoder, options)``; options also repeats the token batch and
    iter_dict so tests can run the encoder directly.
    """
    emb_dim, hidden_dimension = 300, 1024
    time_steps, vocab_size, batch_size = 10, 100, 32
    bidirectional = request.param[0]
    combine_strategy = request.param[1]

    embedding = nn.Embedding.from_pretrained(torch.zeros([vocab_size, emb_dim]))
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=emb_dim)
    encoder = LSTM2VecEncoder(
        emb_dim=emb_dim,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )

    tokens = torch.LongTensor(
        np.random.randint(0, vocab_size - 1, size=(batch_size, time_steps))
    )
    iter_dict = {"tokens": tokens}

    # "concat" over both directions doubles the reported hidden size.
    reported_hidden = (
        2 * hidden_dimension
        if bidirectional and combine_strategy == "concat"
        else hidden_dimension
    )
    options = {
        "emb_dim": emb_dim,
        "vocab_size": vocab_size,
        "hidden_dim": reported_hidden,
        # NOTE(review): reported as False regardless of the actual
        # `bidirectional` parameter — preserved verbatim; confirm intent.
        "bidirectional": False,
        "combine_strategy": combine_strategy,
        "tokens": tokens,
        "batch_size": batch_size,
        "iter_dict": iter_dict,
    }
    return encoder, options
def get_bow_lc_parsect_infer(dirname: str):
    """Build a ClassificationInference client for a BOW linear classifier.

    Reads ``config.json`` and ``test_dataset_params.json`` from ``dirname``,
    reconstructs the BOW_Encoder + SimpleClassifier with the stored
    hyperparameters, and points the client at ``best_model.pt``.

    Args:
        dirname: experiment directory written during training.

    Returns:
        ClassificationInference: inference client over the test dataset.
    """
    exp_dirpath = pathlib.Path(dirname)
    with open(exp_dirpath.joinpath("config.json"), "r") as fp:
        config = json.load(fp)
    with open(exp_dirpath.joinpath("test_dataset_params.json"), "r") as fp:
        test_dataset_args = json.load(fp)

    embedding_dimension = config["EMBEDDING_DIMENSION"]
    model_save_dir = config["MODEL_SAVE_DIR"]
    vocab_size = config["VOCAB_SIZE"]
    num_classes = config["NUM_CLASSES"]
    model_filepath = os.path.join(model_save_dir, "best_model.pt")

    # Fresh (random) embeddings: the trained weights come from the checkpoint.
    embedding = nn.Embedding(vocab_size, embedding_dimension)
    embedder = VanillaEmbedder(
        embedding_dim=embedding_dimension, embedding=embedding
    )
    encoder = BOW_Encoder(
        emb_dim=embedding_dimension,
        embedder=embedder,
        dropout_value=0.0,
        aggregation_type="sum",
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=embedding_dimension,
        num_classes=num_classes,
        classification_layer_bias=True,
    )

    dataset = SectLabelDataset(**test_dataset_args)
    dataset.print_stats()
    return ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
def embedder():
    """Fixture: a VanillaEmbedder over random pretrained embeddings.

    Returns:
        (VanillaEmbedder, dict): the embedder plus an options dict with keys
        ``"tokens"``, ``"embedding_size"``, ``"batch_size"``, ``"time_steps"``.
    """
    batch_size = 32
    time_steps = 10
    vocab_size = 3000
    embedding_dim = 300

    # Trainable (freeze=False) embeddings initialised from a random matrix.
    embedding = nn.Embedding.from_pretrained(
        embeddings=torch.rand(vocab_size, embedding_dim), freeze=False
    )
    # FIX: torch.randint already returns an int64 (Long) tensor, so the
    # original torch.LongTensor(torch.randint(...)) re-wrap was redundant —
    # and passing a tensor to the LongTensor constructor is deprecated.
    tokens = torch.randint(0, vocab_size, size=(batch_size, time_steps))
    options = {
        "tokens": tokens,
        "embedding_size": embedding_dim,
        "batch_size": batch_size,
        "time_steps": time_steps,
    }
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=embedding_dim)
    return embedder, options
idx - 8: classname for idx, classname in idx2classnames.items() if idx in range(8, 16) } material_idx2classnames = { idx - 16: classname for idx, classname in idx2classnames.items() if idx in range(16, 24) } task_constraints = allowed_transitions(constraint_type="BIOUL", labels=task_idx2classnames) process_constraints = allowed_transitions(constraint_type="BIOUL", labels=process_idx2classnames) material_constraints = allowed_transitions(constraint_type="BIOUL", labels=material_idx2classnames) embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION) if USE_CHAR_ENCODER: char_embedder = VanillaEmbedder(embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION) char_encoder = CharLSTMEncoder( char_emb_dim=CHAR_EMBEDDING_DIMENSION, char_embedder=char_embedder, bidirectional=True, hidden_dim=CHAR_ENCODER_HIDDEN_DIM, combine_strategy="concat", device=torch.device(DEVICE), ) embedder = ConcatEmbedders([embedder, char_encoder]) EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM
def get_elmo_bilstm_lc_infer(dirname: str):
    """Build a ClassificationInference client for an ELMo + BiLSTM classifier.

    Reads ``config.json`` and ``test_dataset_params.json`` from ``dirname``,
    reconstructs the concat(word, ELMo) embedder feeding an LSTM2VecEncoder
    and SimpleClassifier, and points the client at ``best_model.pt``.

    Args:
        dirname: experiment directory written during training.

    Returns:
        ClassificationInference: inference client over the test dataset.
    """
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")
    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)
    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)

    # Hyperparameters recorded by the training run.
    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")

    # Fresh word embeddings: trained weights are restored from the checkpoint.
    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    # DEVICE is either "cpu" or "cuda:<id>"; ELMo wants the bare integer id.
    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(
            DEVICE.split("cuda:")[1]),
    )
    vanilla_embedder = VanillaEmbedder(embedding=embedding,
                                       embedding_dim=EMBEDDING_DIM)
    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])

    # +1024 widens the input for the ELMo representation — presumably ELMo's
    # output size; TODO confirm against BowElmoEmbedder.
    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )
    # Bidirectional + "concat" doubles the encoding fed to the classifier.
    encoding_dim = (2 * HIDDEN_DIM if BIDIRECTIONAL
                    and COMBINE_STRATEGY == "concat" else HIDDEN_DIM)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )
    dataset = SectLabelDataset(**test_dataset_args)
    inference = ClassificationInference(model=model,
                                        model_filepath=model_filepath,
                                        dataset=dataset)
    return inference
def setup_engine_once(
    config_dict: Dict[str, str],
    experiment_name: str,
    train_data_filepath: pathlib.Path,
    test_data_filepath: pathlib.Path,
):
    """Build a training Engine for the Parscit tagger from a config dict.

    Creates the experiment directory tree under OUTPUT_DIR, constructs
    train/valid/test ParscitDatasets (valid and test both read
    ``test_data_filepath``), assembles the optional char encoder, the
    Lstm2Seq encoder and ParscitTagger, and writes the augmented config
    back to ``<exp_dir>/config.json``.

    Args:
        config_dict: hyperparameter dictionary; mutated in place with vocab
            locations, model save dir, vocab size and class count.
        experiment_name: name of the experiment directory under OUTPUT_DIR.
        train_data_filepath: CoNLL-style training file.
        test_data_filepath: CoNLL-style file used for both valid and test.

    Returns:
        Engine: fully wired engine ready to run.
    """
    # Unpack hyperparameters from the config dictionary.
    DEBUG = config_dict["DEBUG"]
    DEBUG_DATASET_PROPORTION = config_dict["DEBUG_DATASET_PROPORTION"]
    BATCH_SIZE = config_dict["BATCH_SIZE"]
    LEARNING_RATE = config_dict["LEARNING_RATE"]
    NUM_EPOCHS = config_dict["NUM_EPOCHS"]
    SAVE_EVERY = config_dict["SAVE_EVERY"]
    LOG_TRAIN_METRICS_EVERY = config_dict["LOG_TRAIN_METRICS_EVERY"]
    EMBEDDING_DIMENSION = config_dict["EMBEDDING_DIMENSION"]
    CHAR_EMBEDDING_DIMENSION = config_dict["CHAR_EMBEDDING_DIMENSION"]
    EMBEDDING_TYPE = config_dict["EMBEDDING_TYPE"]
    MAX_NUM_WORDS = config_dict["MAX_NUM_WORDS"]
    MAX_LENGTH = config_dict["MAX_LENGTH"]
    DEVICE = config_dict["DEVICE"]
    HIDDEN_DIM = config_dict["HIDDEN_DIM"]
    BIDIRECTIONAL = config_dict["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config_dict["COMBINE_STRATEGY"]
    MAX_CHAR_LENGTH = config_dict["MAX_CHAR_LENGTH"]
    USE_CHAR_ENCODER = config_dict["USE_CHAR_ENCODER"]
    CHAR_ENCODER_HIDDEN_DIM = config_dict["CHAR_ENCODER_HIDDEN_DIM"]
    DROPOUT = config_dict["DROPOUT"]
    EXP_NAME = experiment_name

    # Experiment directory layout: <OUTPUT_DIR>/<name>/{checkpoints,*.json}.
    EXP_DIR_PATH = os.path.join(OUTPUT_DIR, EXP_NAME)
    if not os.path.isdir(EXP_DIR_PATH):
        os.mkdir(EXP_DIR_PATH)
    MODEL_SAVE_DIR = os.path.join(EXP_DIR_PATH, "checkpoints")
    if not os.path.isdir(MODEL_SAVE_DIR):
        os.mkdir(MODEL_SAVE_DIR)
    VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "vocab.json")
    CHAR_VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "char_vocab.json")
    CAPITALIZATION_VOCAB_STORE_LOCATION = os.path.join(
        EXP_DIR_PATH, "capitalization_vocab.json")
    CAPITALIZATION_EMBEDDING_DIMENSION = 10
    TENSORBOARD_LOGDIR = os.path.join(".", "runs", EXP_NAME)

    # NOTE(review): "captialization_vocab_store_location" is misspelled but
    # matches the ParscitDataset keyword — do not "fix" it here without
    # changing the dataset API.
    train_dataset = ParscitDataset(
        filename=str(train_data_filepath),
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    # Validation and test both read the test file (only dataset_type differs).
    validation_dataset = ParscitDataset(
        filename=str(test_data_filepath),
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    test_dataset = ParscitDataset(
        filename=str(test_data_filepath),
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    train_dataset.print_stats()
    validation_dataset.print_stats()
    test_dataset.print_stats()

    VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
    NUM_CLASSES = train_dataset.get_num_classes()
    # Word and char tables are loaded from the vocab and kept trainable.
    embedding = train_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding, freeze=False)
    char_embedding = train_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding, freeze=False)
    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)
    if USE_CHAR_ENCODER:
        # Character-level BiLSTM features are concatenated onto the word
        # embeddings; the effective embedding width grows accordingly.
        char_embedder = VanillaEmbedder(
            embedding=char_embedding,
            embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        # Bidirectional + "concat" doubles the encoder output width.
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    metric = TokenClassificationAccuracy(
        idx2labelname_mapping=train_dataset.idx2classname)
    # LR is reduced when macro-fscore plateaus (mode="max").
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer, mode="max", factor=0.1, patience=2)
    engine = Engine(
        model=model,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=MODEL_SAVE_DIR,
        num_epochs=NUM_EPOCHS,
        save_every=SAVE_EVERY,
        log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
        tensorboard_logdir=TENSORBOARD_LOGDIR,
        device=torch.device(DEVICE),
        metric=metric,
        track_for_best="macro_fscore",
        lr_scheduler=scheduler,
    )

    # Persist derived settings so inference can rebuild the model later.
    config_dict["VOCAB_STORE_LOCATION"] = VOCAB_STORE_LOCATION
    config_dict["CHAR_VOCAB_STORE_LOCATION"] = CHAR_VOCAB_STORE_LOCATION
    config_dict["MODEL_SAVE_DIR"] = MODEL_SAVE_DIR
    config_dict["VOCAB_SIZE"] = VOCAB_SIZE
    config_dict["NUM_CLASSES"] = NUM_CLASSES
    with open(os.path.join(f"{EXP_DIR_PATH}", "config.json"), "w") as fp:
        json.dump(config_dict, fp)
    return engine
def get_science_ie_infer(dirname: str):
    """Build a ScienceIEInference client from a saved experiment directory.

    Reads ``config.json`` from ``dirname``, reconstructs the Lstm2Seq-based
    ScienceIETagger (with three per-entity CRF constraint sets) using the
    stored hyperparameters, and points it at ``best_model.pt`` over the
    dev CoNLL file.

    Args:
        dirname: experiment directory containing ``config.json``.

    Returns:
        ScienceIEInference: inference client over the dev dataset.
    """
    model_folder = pathlib.Path(dirname)
    hyperparam_config_filename = model_folder.joinpath("config.json")
    with open(hyperparam_config_filename, "r") as fp:
        config = json.load(fp)

    # Hyperparameters recorded by the training run (missing keys -> None).
    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    NUM_LAYERS = config.get("NUM_LAYERS", 1)
    DROPOUT = config.get("DROPOUT", 0.0)
    # FIX: the original was print(f"NUM_LAYERS", NUM_LAYERS) — an f-string
    # with no placeholders used as a plain first argument (lint F541).
    # Dropping the meaningless f-prefix leaves the output unchanged.
    print("NUM_LAYERS", NUM_LAYERS)

    test_science_ie_conll_filepath = pathlib.Path(DATA_DIR,
                                                  "dev_science_ie_conll.txt")
    test_dataset = ScienceIEDataset(
        filename=test_science_ie_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )

    # Embedding tables come from the stored vocabularies; trained weights
    # are restored from the checkpoint by the inference client.
    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)
    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    # Class ids are banded per entity type: 0-7 task, 8-15 process,
    # 16-23 material; each band is re-based to 0 for its own CRF.
    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {idx: classname
                      for classname, idx in classnames2idx.items()}
    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items()
        if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items()
        if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items()
        if idx in range(16, 24)
    }
    task_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames
    )
    process_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames
    )
    material_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames
    )

    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)
    if USE_CHAR_ENCODER:
        # Char BiLSTM features are concatenated onto the word embeddings,
        # growing the effective embedding width.
        char_embedder = VanillaEmbedder(
            embedding=char_embedding,
            embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        num_layers=NUM_LAYERS,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ScienceIETagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        # Bidirectional + "concat" doubles the encoder output width.
        hid_dim=2 * HIDDEN_DIMENSION
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIMENSION,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=torch.device(DEVICE),
    )
    inference_client = ScienceIEInference(
        model=model, model_filepath=str(model_filepath), dataset=test_dataset
    )
    return inference_client
def setup_science_ie_tagger(request):
    """Fixture: a ScienceIETagger over zero embeddings with random batches.

    ``request.param`` supplies ``(bidirectional, combine_strategy,
    have_character_encoder)``.  Returns ``(tagger, options)`` where the
    options dict carries the configuration plus token/label tensors.
    """
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    DEVICE = torch.device("cpu")
    NUM_CLASSES = 8
    # All-zero embedding tables keep the forward pass deterministic.
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM]))

    # Labels are drawn from the three entity-type id bands:
    # 0-7 task, 8-15 process, 16-23 material; concatenated along time.
    tokens = np.random.randint(0, VOCAB_SIZE,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    task_labels = np.random.randint(0, 8, size=(BATCH_SIZE, NUM_TIME_STEPS))
    process_labels = np.random.randint(8, 16,
                                       size=(BATCH_SIZE, NUM_TIME_STEPS))
    material_labels = np.random.randint(16, 24,
                                        size=(BATCH_SIZE, NUM_TIME_STEPS))
    task_labels = torch.LongTensor(task_labels)
    process_labels = torch.LongTensor(process_labels)
    material_labels = torch.LongTensor(material_labels)
    labels = torch.cat([task_labels, process_labels, material_labels], dim=1)
    char_tokens = np.random.randint(
        0, CHAR_VOCAB_SIZE - 1,
        size=(BATCH_SIZE, NUM_TIME_STEPS, MAX_CHAR_LENGTH))
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    # Per-band label maps (process/material re-based to 0) feed the
    # BIOUL transition constraints of each CRF head.
    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {
        idx: classname
        for classname, idx in classnames2idx.items()
    }
    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items()
        if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items()
        if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items()
        if idx in range(16, 24)
    }
    # Annotation fixed from List[(int, int)] (invalid type expression) to
    # List[Tuple[int, int]]; Tuple must be importable from typing at file
    # top — confirm the import exists.
    task_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames)
    process_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames)
    material_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)
    if HAVE_CHARACTER_ENCODER:
        # Char BiLSTM features are concatenated onto the word embeddings,
        # growing the effective embedding width.
        char_embedder = VanillaEmbedder(
            embedding=CHARACTER_EMBEDDING,
            embedding_dim=CHARACTER_EMBEDDING_DIM)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM += 2 * CHARACTER_ENCODER_HIDDEN_DIM

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )
    tagger = ScienceIETagger(
        rnn2seqencoder=encoder,
        # Bidirectional + "concat" doubles the encoder output width.
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=DEVICE,
    )
    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
def vanilla_embedder():
    """Fixture: a VanillaEmbedder over a randomly initialised 1000 x 100
    embedding table, plus a dict describing its dimensions."""
    vocab_size, emb_dim = 1000, 100
    table = nn.Embedding(vocab_size, emb_dim)
    return (
        VanillaEmbedder(embedding=table, embedding_dim=emb_dim),
        {"EMBEDDING_DIM": emb_dim, "VOCAB_SIZE": vocab_size},
    )
def setup_parscit_tagger(request):
    """Fixture: a ParscitTagger over zero embeddings with random batches.

    ``request.param`` supplies ``(bidirectional, combine_strategy,
    have_character_encoder)``.  Returns ``(tagger, options)`` where the
    options dict carries the configuration plus token/label tensors.
    """
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    NUM_CLASSES = 5
    # All-zero embedding tables keep the forward pass deterministic.
    EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM]))

    # Random word ids, tag ids and per-word character ids.
    tokens = np.random.randint(0, VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    labels = np.random.randint(0, NUM_CLASSES - 1,
                               size=(BATCH_SIZE, NUM_TIME_STEPS))
    char_tokens = np.random.randint(
        0, CHAR_VOCAB_SIZE - 1,
        size=(BATCH_SIZE, NUM_TIME_STEPS, MAX_CHAR_LENGTH))
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    embedder = VanillaEmbedder(embedding=EMBEDDING,
                               embedding_dim=EMBEDDING_DIM)
    if HAVE_CHARACTER_ENCODER:
        # Char BiLSTM features are concatenated onto the word embeddings,
        # growing the effective embedding width.
        char_embedder = VanillaEmbedder(
            embedding=CHARACTER_EMBEDDING,
            embedding_dim=CHARACTER_EMBEDDING_DIM)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM = EMBEDDING_DIM + (2 * CHARACTER_ENCODER_HIDDEN_DIM)

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )
    tagger = ParscitTagger(
        rnn2seqencoder=encoder,
        # Bidirectional + "concat" doubles the encoder output width.
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
    )
    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
def get_bilstm_crf_infer(dirname: str):
    """Build a ParscitInference client from a saved experiment directory.

    Reads ``config.json`` inside ``dirname``, reconstructs the BiLSTM-CRF
    ParscitTagger with the stored hyperparameters, and points it at
    ``best_model.pt`` over the Cora CoNLL test file.

    Args:
        dirname: experiment directory containing ``config.json``.

    Returns:
        ParscitInference: inference client over the test dataset.
    """
    hyperparam_config_filepath = pathlib.Path(dirname, "config.json")
    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    # Hyperparameters recorded by the training run (missing keys -> None).
    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    DROPOUT = config.get("DROPOUT", 0.0)

    test_conll_filepath = pathlib.Path(DATA_DIR, "cora_conll.txt")
    test_dataset = ParscitDataset(
        filename=test_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )

    # Embedding tables come from the stored vocabularies; trained weights
    # are restored from the checkpoint by the inference client.
    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)
    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)
    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)
    if USE_CHAR_ENCODER:
        # Char BiLSTM features are concatenated onto the word embeddings,
        # growing the effective embedding width.
        char_embedder = VanillaEmbedder(
            embedding=char_embedding,
            embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        # Bidirectional + "concat" doubles the encoder output width.
        hid_dim=2 * HIDDEN_DIMENSION
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIMENSION,
    )
    inference_client = ParscitInference(model=model,
                                        model_filepath=str(model_filepath),
                                        dataset=test_dataset)
    return inference_client
def setup_engine_test_with_simple_classifier(request, tmpdir_factory):
    """Fixture: a full Engine wired around a tiny BOW SimpleClassifier.

    Builds debug-mode train/valid/test SectLabelDatasets sharing one temp
    vocab file, a zero-embedding BOW classifier and an SGD optimizer, then
    wraps them in an Engine.  ``request.param`` is forwarded as the metric
    to track for the best model.

    Returns:
        (Engine, tokens, labels, options): the engine, a random token batch,
        a single gold label and a dict echoing the configuration.
    """
    MAX_NUM_WORDS = 1000
    MAX_LENGTH = 50
    vocab_store_location = tmpdir_factory.mktemp("tempdir").join("vocab.json")
    DEBUG = True
    BATCH_SIZE = 1
    NUM_TOKENS = 3
    EMB_DIM = 300
    # All three splits read the same file; only dataset_type differs.
    train_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    validation_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    test_dataset = SectLabelDataset(
        filename=SECT_LABEL_FILE,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        word_vocab_store_location=vocab_store_location,
        debug=DEBUG,
        word_embedding_type="random",
        word_embedding_dimension=EMB_DIM,
    )
    # Vocabulary size includes the special tokens added by the vocab.
    VOCAB_SIZE = MAX_NUM_WORDS + len(train_dataset.word_vocab.special_vocab)
    NUM_CLASSES = train_dataset.get_num_classes()
    NUM_EPOCHS = 1
    # Zero-valued embeddings keep the classifier forward deterministic.
    embedding = Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMB_DIM]))
    labels = torch.LongTensor([1])
    metric = PrecisionRecallFMeasure(
        idx2labelname_mapping=train_dataset.idx2classname)
    embedder = VanillaEmbedder(embedding_dim=EMB_DIM, embedding=embedding)
    encoder = BOW_Encoder(emb_dim=EMB_DIM,
                          embedder=embedder,
                          dropout_value=0,
                          aggregation_type="sum")
    tokens = np.random.randint(0, VOCAB_SIZE - 1,
                               size=(BATCH_SIZE, NUM_TOKENS))
    tokens = torch.LongTensor(tokens)
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMB_DIM,
        num_classes=NUM_CLASSES,
        classification_layer_bias=False,
    )
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    engine = Engine(
        model,
        train_dataset,
        validation_dataset,
        test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=tmpdir_factory.mktemp("model_save"),
        num_epochs=NUM_EPOCHS,
        save_every=1,
        log_train_metrics_every=10,
        metric=metric,
        # The parametrized metric name the engine tracks for "best".
        track_for_best=request.param,
    )
    options = {
        "MAX_NUM_WORDS": MAX_NUM_WORDS,
        "MAX_LENGTH": MAX_LENGTH,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_TOKENS": NUM_TOKENS,
        "EMB_DIM": EMB_DIM,
        "VOCAB_SIZE": VOCAB_SIZE,
        "NUM_CLASSES": NUM_CLASSES,
        "NUM_EPOCHS": NUM_EPOCHS,
    }
    return engine, tokens, labels, options
with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp: json.dump(config, fp) # load the word embeddings embeddings = train_dataset.word_vocab.load_embedding() embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False) # instantiate the elmo embedder elmo_embedder = BowElmoEmbedder( layer_aggregation="sum", cuda_device_id=-1 if DEVICE == "cpu" else int( DEVICE.split("cuda:")[1]), ) # instantiate the vanilla embedder vanilla_embedder = VanillaEmbedder(embedding=embeddings, embedding_dim=EMBEDDING_DIMENSION) # concat the embeddings embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder]) encoder = LSTM2VecEncoder( emb_dim=EMBEDDING_DIMENSION + 1024, embedder=embedder, hidden_dim=HIDDEN_DIMENSION, bidirectional=BIDIRECTIONAL, combine_strategy=COMBINE_STRATEGY, device=torch.device(DEVICE), ) encoding_dim = (2 * HIDDEN_DIMENSION if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION)
NUM_CLASSES = train_dataset.get_num_classes() config["VOCAB_SIZE"] = VOCAB_SIZE config["NUM_CLASSES"] = NUM_CLASSES with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp: json.dump(config, fp) with open(os.path.join(EXP_DIR_PATH, "test_dataset_params.json"), "w") as fp: json.dump(test_dataset_params, fp) random_embeddings = train_dataset.word_vocab.load_embedding() random_embeddings = nn.Embedding.from_pretrained(random_embeddings, freeze=False) embedder = VanillaEmbedder( embedding_dim=EMBEDDING_DIMENSION, embedding=random_embeddings ) encoder = LSTM2VecEncoder( emb_dim=EMBEDDING_DIMENSION, embedder=embedder, dropout_value=0.0, hidden_dim=HIDDEN_DIMENSION, combine_strategy=COMBINE_STRATEGY, bidirectional=BIDIRECTIONAL, device=torch.device(DEVICE), ) classiier_encoding_dim = 2 * HIDDEN_DIMENSION if BIDIRECTIONAL else HIDDEN_DIMENSION model = SimpleClassifier( encoder=encoder, encoding_dim=classiier_encoding_dim,