def _get_model(self) -> nn.Module:
    embedding_type = self.hparams.get("emb_type")
    word_embedder = WordEmbedder(embedding_type=embedding_type)
    elmo_embedder = ElmoEmbedder(datasets_manager=self.data_manager)
    embedder = ConcatEmbedders([word_embedder, elmo_embedder])
    hidden_dim = self.hparams.get("hidden_dim")
    combine_strategy = self.hparams.get("combine_strategy")
    bidirectional = self.hparams.get("bidirectional")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=hidden_dim,
        combine_strategy=combine_strategy,
        bidirectional=bidirectional,
    )
    # The encoder output doubles only when both directions are concatenated
    classifier_encoding_dim = (
        2 * hidden_dim if bidirectional and combine_strategy == "concat" else hidden_dim
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=3,
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
def _get_model(self):
    elmo_embedder = BowElmoEmbedder(layer_aggregation="sum")
    # instantiate the vanilla embedder
    vanilla_embedder = WordEmbedder(embedding_type=self.hparams.get("emb_type"))
    # concat the embeddings
    embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])
    hidden_dim = self.hparams.get("hidden_dim")
    bidirectional = self.hparams.get("bidirectional")
    combine_strategy = self.hparams.get("combine_strategy")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=hidden_dim,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
    )
    encoding_dim = (
        2 * hidden_dim if bidirectional and combine_strategy == "concat" else hidden_dim
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
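# A minimal sketch, not part of the library: a hypothetical helper that makes
# the encoding-dim rule used in the builders above explicit.
def lstm2vec_encoding_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    # A bidirectional LSTM yields one final hidden vector per direction:
    # "concat" joins them (2 * hidden_dim), while "sum" adds them (hidden_dim).
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim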
def setup_lstm2vecencoder(request):
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )
    texts = ["First sentence", "second sentence"]
    lines = [Line(text=text) for text in texts]
    return (
        encoder,
        {
            "hidden_dim": 2 * hidden_dimension
            if bidirectional and combine_strategy == "concat"
            else hidden_dimension,
            "bidirectional": bidirectional,
            "combine_strategy": combine_strategy,
            "lines": lines,
        },
    )
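# A hedged sketch (assumed, not taken from the suite) of how the fixture's
# (bidirectional, combine_strategy) params and returned options are consumed;
# calling the encoder with `lines=` assumes the Line-based forward interface
# used elsewhere in this codebase.
import pytest

@pytest.mark.parametrize(
    "setup_lstm2vecencoder", [(True, "concat"), (False, "sum")], indirect=True
)
def test_encoding_dim_matches_options(setup_lstm2vecencoder):
    encoder, options = setup_lstm2vecencoder
    encoding = encoder(lines=options["lines"])
    assert encoding.size(-1) == options["hidden_dim"]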
def build_model(self):
    word_embedder = WordEmbedder(
        embedding_type=self.hparams.get("embedding_type"),
        device=self.hparams.get("device"),
    )
    elmo_embedder = ElmoEmbedder(device=self.hparams.get("device"))
    embedder = ConcatEmbedders([word_embedder, elmo_embedder])
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        combine_strategy=self.hparams.get("combine_strategy"),
        bidirectional=self.hparams.get("bidirectional"),
        device=torch.device(self.hparams.get("device")),
    )
    classifier_encoding_dim = (
        2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim")
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=self.hparams.get("num_classes"),
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
        device=self.hparams.get("device"),
    )
    return model
def test_raises_error_on_wrong_combine_strategy(self, setup_lstm2vecencoder):
    # The assertion on combine_strategy fires during construction
    with pytest.raises(AssertionError):
        encoder = LSTM2VecEncoder(
            emb_dim=300,
            embedder=VanillaEmbedder(nn.Embedding(10, 1024), embedding_dim=1024),
            combine_strategy="add",
        )
def build_sectlabel_elmobilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")
    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )
    DEVICE = "cpu"
    EMBEDDING_TYPE = "glove_6B_50"
    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    elmo_embedder = BowElmoEmbedder(
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1])
    )
    vanilla_embedder = WordEmbedder(embedding_type=EMBEDDING_TYPE)
    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])
    encoder = LSTM2VecEncoder(
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )
    encoding_dim = (
        2 * HIDDEN_DIM if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )
    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
def setup_lstm2vecencoder(request):
    emb_dim = 300
    time_steps = 10
    vocab_size = 100
    batch_size = 32
    embedding = nn.Embedding.from_pretrained(torch.zeros([vocab_size, emb_dim]))
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    tokens = np.random.randint(0, vocab_size - 1, size=(batch_size, time_steps))
    tokens = torch.LongTensor(tokens)
    iter_dict = {"tokens": tokens}
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=emb_dim)
    encoder = LSTM2VecEncoder(
        emb_dim=emb_dim,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )
    return (
        encoder,
        {
            "emb_dim": emb_dim,
            "vocab_size": vocab_size,
            "hidden_dim": 2 * hidden_dimension
            if bidirectional and combine_strategy == "concat"
            else hidden_dimension,
            "bidirectional": bidirectional,
            "combine_strategy": combine_strategy,
            "tokens": tokens,
            "batch_size": batch_size,
            "iter_dict": iter_dict,
        },
    )
def get_bilstm_lc_infer_parsect(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")
    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)
    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")
    classifier_encoding_dim = (
        2 * HIDDEN_DIM if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM
    )
    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    embedder = VanillaEmbedder(embedding_dim=EMBEDDING_DIM, embedding=embedding)
    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )
    dataset = SectLabelDataset(**test_dataset_args)
    inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return inference
def build_sectlabel_bilstm_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")
    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )
    HIDDEN_DIM = 512
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    classifier_encoding_dim = 2 * HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=HIDDEN_DIM,
        combine_strategy=COMBINE_STRATEGY,
        bidirectional=BIDIRECTIONAL,
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=classifier_encoding_dim,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
    )
    inference = ClassificationInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return inference
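# A hedged usage sketch for the builder above. The run_test()/report_metrics()
# calls follow common sciwing inference usage and are an assumption here, not
# guaranteed by this snippet; the experiment directory path is hypothetical.
inference = build_sectlabel_bilstm_model("experiments/sectlabel_bilstm")
inference.run_test()
inference.report_metrics()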
def __init__(
    self,
    char_embedder: nn.Module,
    char_emb_dim: int,
    hidden_dim: int = 1024,
    bidirectional: bool = False,
    combine_strategy: str = "concat",
    device: torch.device = torch.device("cpu"),
):
    """Encodes character tokens using LSTMs

    Parameters
    ----------
    char_embedder : nn.Module
        An embedder that embeds character tokens
    char_emb_dim : int
        The embedding dimension of the characters
    hidden_dim : int
        Hidden dimension of the LSTM
    bidirectional : bool
        Whether the LSTM should be bi-directional
    combine_strategy : str
        Combine strategy for the LSTM hidden dimensions
    device : torch.device
        The device on which the LSTM will run
    """
    super(CharLSTMEncoder, self).__init__()
    self.char_embedder = char_embedder
    self.char_emb_dim = char_emb_dim
    self.hidden_dim = hidden_dim
    self.bidirectional = bidirectional
    self.combine_strategy = combine_strategy
    self.device = device
    self.seq2vecencoder = LSTM2VecEncoder(
        embedder=self.char_embedder,
        emb_dim=char_emb_dim,
        hidden_dim=hidden_dim,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=True,
        device=device,
    )
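# A construction-only sketch for the class above; the plain nn.Embedding
# stand-in for the character embedder and all dimensions are illustrative
# assumptions, not the library's own components.
char_encoder = CharLSTMEncoder(
    char_embedder=nn.Embedding(num_embeddings=128, embedding_dim=25),
    char_emb_dim=25,
    hidden_dim=100,
    bidirectional=True,
    combine_strategy="concat",
)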
def build_model(self):
    embedder = WordEmbedder(embedding_type=self.hparams.get("embedding_type"))
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        combine_strategy=self.hparams.get("combine_strategy"),
        bidirectional=self.hparams.get("bidirectional"),
    )
    # Double the encoding size only when a bidirectional encoder concatenates
    # the two directions; the original hardcoded 2 * hidden_dim unconditionally
    encoding_dim = (
        2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim")
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=self.hparams.get("num_classes"),
        classification_layer_bias=True,
        datasets_manager=self.data_manager,
    )
    return model
layer_aggregation="sum", cuda_device_id=-1 if DEVICE == "cpu" else int( DEVICE.split("cuda:")[1]), ) # instantiate the vanilla embedder vanilla_embedder = VanillaEmbedder(embedding=embeddings, embedding_dim=EMBEDDING_DIMENSION) # concat the embeddings embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder]) encoder = LSTM2VecEncoder( emb_dim=EMBEDDING_DIMENSION + 1024, embedder=embedder, hidden_dim=HIDDEN_DIMENSION, bidirectional=BIDIRECTIONAL, combine_strategy=COMBINE_STRATEGY, device=torch.device(DEVICE), ) encoding_dim = (2 * HIDDEN_DIMENSION if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIMENSION) model = SimpleClassifier( encoder=encoder, encoding_dim=encoding_dim, num_classes=NUM_CLASSES, classification_layer_bias=True, ) optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
DATA_PATH = pathlib.Path(DATA_DIR)
train_file = DATA_PATH.joinpath("sectLabel.train")
dev_file = DATA_PATH.joinpath("sectLabel.dev")
test_file = DATA_PATH.joinpath("sectLabel.test")
data_manager = TextClassificationDatasetManager(
    train_filename=str(train_file),
    dev_filename=str(dev_file),
    test_filename=str(test_file),
)
embedder = WordEmbedder(embedding_type=args.emb_type, device=args.device)
encoder = LSTM2VecEncoder(
    embedder=embedder,
    hidden_dim=args.hidden_dim,
    combine_strategy=args.combine_strategy,
    bidirectional=args.bidirectional,
    device=torch.device(args.device),
)
classifier_encoding_dim = (
    2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim
)
model = SimpleClassifier(
    encoder=encoder,
    encoding_dim=classifier_encoding_dim,
    num_classes=23,
    classification_layer_bias=True,
    datasets_manager=data_manager,
    device=args.device,
)
def get_elmo_bilstm_lc_infer(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    hyperparam_config_filepath = exp_dirpath.joinpath("config.json")
    test_dataset_params = exp_dirpath.joinpath("test_dataset_params.json")
    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)
    with open(test_dataset_params, "r") as fp:
        test_dataset_args = json.load(fp)
    DEVICE = config["DEVICE"]
    EMBEDDING_DIM = config["EMBEDDING_DIMENSION"]
    VOCAB_SIZE = config["VOCAB_SIZE"]
    HIDDEN_DIM = config["HIDDEN_DIMENSION"]
    BIDIRECTIONAL = config["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config["COMBINE_STRATEGY"]
    NUM_CLASSES = config["NUM_CLASSES"]
    MODEL_SAVE_DIR = config["MODEL_SAVE_DIR"]
    model_filepath = os.path.join(MODEL_SAVE_DIR, "best_model.pt")
    embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    elmo_embedder = BowElmoEmbedder(
        layer_aggregation="sum",
        cuda_device_id=-1 if DEVICE == "cpu" else int(DEVICE.split("cuda:")[1]),
    )
    vanilla_embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIM)
    embedders = ConcatEmbedders([vanilla_embedder, elmo_embedder])
    encoder = LSTM2VecEncoder(
        emb_dim=EMBEDDING_DIM + 1024,  # word embedding dim + 1024-dim ELMo vectors
        embedder=embedders,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        device=torch.device(DEVICE),
    )
    encoding_dim = (
        2 * HIDDEN_DIM if BIDIRECTIONAL and COMBINE_STRATEGY == "concat" else HIDDEN_DIM
    )
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=encoding_dim,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )
    dataset = SectLabelDataset(**test_dataset_args)
    inference = ClassificationInference(
        model=model, model_filepath=model_filepath, dataset=dataset
    )
    return inference
def test_raises_error_on_wrong_combine_strategy(self, setup_lstm2vecencoder):
    with pytest.raises(AssertionError):
        encoder = LSTM2VecEncoder(
            embedder=WordEmbedder(embedding_type="glove_6B_50"),
            combine_strategy="add",
        )
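# A hedged counterpart sketch (assumed, not from the suite): a supported
# strategy such as "concat" should construct without raising.
def test_accepts_valid_combine_strategy():
    encoder = LSTM2VecEncoder(
        embedder=WordEmbedder(embedding_type="glove_6B_50"),
        combine_strategy="concat",
    )
    assert encoder is not None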