def _get_model(self) -> nn.Module:
    word_embedder = TrainableWordEmbedder(
        embedding_type=self.hparams.get("emb_type"),
        datasets_manager=self.data_manager,
    )
    elmo_embedder = BowElmoEmbedder(
        datasets_manager=self.data_manager, layer_aggregation="sum"
    )
    embedder = ConcatEmbedders([word_embedder, elmo_embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        bidirectional=self.hparams.get("bidirectional"),
        combine_strategy=self.hparams.get("combine_strategy"),
        rnn_bias=True,
        dropout_value=self.hparams.get("lstm2seq_dropout", 0.0),
        add_projection_layer=False,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim"),
        datasets_manager=self.data_manager,
    )
    return model
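
# The conditional encoding_dim expression above recurs in nearly every
# snippet in this section. A hypothetical helper (not part of SciWING, just
# a sketch of the rule): a bidirectional LSTM whose two directions are
# concatenated doubles the dimension the CRF tagger receives.
def crf_encoding_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    """Dimension handed to the CRF layer by the LSTM encoder."""
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim
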
def build_model(self):
    embedder = TrainableWordEmbedder(
        embedding_type=self.hparams.get("emb_type"),
        datasets_manager=self.data_manager,
        device=self.hparams.get("device"),
    )
    # A single embedder is still wrapped in ConcatEmbedders so the encoder
    # sees the same interface as in the multi-embedder setups.
    embedder = ConcatEmbedders([embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=self.hparams.get("dropout"),
        hidden_dim=self.hparams.get("hidden_dim"),
        bidirectional=self.hparams.get("bidirectional"),
        combine_strategy=self.hparams.get("combine_strategy"),
        rnn_bias=True,
        device=self.hparams.get("device"),
        num_layers=self.hparams.get("num_layers"),
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim"),
        device=self.hparams.get("device"),
        tagging_type="IOB1",
        datasets_manager=self.data_manager,
    )
    return model
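
# A sketch of the hyper-parameter dictionary this builder reads; the keys
# come from the method body, the values are illustrative assumptions rather
# than recommended settings.
hparams = {
    "emb_type": "glove_6B_100",
    "device": "cpu",
    "dropout": 0.5,
    "hidden_dim": 256,
    "bidirectional": True,
    "combine_strategy": "concat",
    "num_layers": 1,
}
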
def setup_parscit_tagger(seq_dataset_manager):
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )
    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )
    return (
        tagger,
        dataset_manager,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
        },
    )
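
# A sketch of how a test might consume this fixture; with HIDDEN_DIM = 100,
# a bidirectional encoder, and the "concat" strategy, the expected hidden
# dimension works out to 2 * 100 = 200.
tagger, dataset_manager, options = setup_parscit_tagger(seq_dataset_manager)
assert options["EXPECTED_HIDDEN_DIM"] == 200
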
def build_science_ie_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    data_dir = pathlib.Path(DATA_DIR)
    train_filename = data_dir.joinpath("train_science_ie_conll.txt")
    dev_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    # The dev split is reused as the test split here.
    test_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["TASK", "PROCESS", "MATERIAL"],
    )
    word_embedder = TrainableWordEmbedder(
        embedding_type="glove_6B_100", datasets_manager=data_manager
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=20, hidden_dimension=25, datasets_manager=data_manager
    )
    embedder = ConcatEmbedders([word_embedder, char_embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=350,
        bidirectional=True,
        combine_strategy="concat",
        rnn_bias=True,
        device=torch.device("cpu"),
        num_layers=2,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=700,  # 2 * 350 because of the bidirectional concat
        datasets_manager=data_manager,
        namespace_to_constraints=None,
        tagging_type="BIOUL",
    )
    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer
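
# A hedged usage sketch: assuming SequenceLabellingInference exposes
# run_test() and report_metrics() like other SciWING inference clients (an
# assumption, not confirmed by this snippet), evaluating the restored model
# would look like this; the experiment directory below is a placeholder.
infer = build_science_ie_model("experiments/science_ie")
infer.run_test()
infer.report_metrics()
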
def build_ner_biobert_model(hparams, dirname: str):
    # dirname: experiment directory that holds checkpoints/best_model.pt
    # data_dir = pathlib.Path(DATA_DIR)
    exp_dirpath = pathlib.Path(dirname)
    dataset = hparams.get("dataset")
    train_filename = f"./ner/{dataset}/{dataset}.train"
    dev_filename = f"./ner/{dataset}/{dataset}.dev"
    test_filename = f"./ner/{dataset}/{dataset}.test"
    data_manager = BioNERDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["NER"],
        train_only="ner",
    )
    config = transformers.BertConfig(output_hidden_states=True, vocab_size=28996)
    bert_model = AutoModelWithLMHead.from_pretrained(
        "monologg/biobert_v1.1_pubmed", config=config
    )
    tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
    # TODO: Specifying the max length
    biobert2seqencoder = Biobert2SeqEncoder(
        tokenizer=tokenizer,
        model=bert_model,
        device=torch.device(hparams.get("device")),
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=biobert2seqencoder,
        encoding_dim=768,  # hidden size of BERT-base
        device=torch.device(hparams.get("device")),
        datasets_manager=data_manager,
    )
    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer
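
# Hypothetical invocation of the builder; the dataset name, device, and
# experiment directory are placeholders, and the keys mirror what the
# function body reads from hparams.
hparams = {"dataset": "bc5cdr", "device": "cpu"}
infer = build_ner_biobert_model(hparams, dirname="experiments/ner_biobert")
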
def build_model(self):
    word_embedder = TrainableWordEmbedder(
        embedding_type=self.hparams.get("emb_type"),
        datasets_manager=self.data_manager,
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=self.hparams.get("char_emb_dim"),
        hidden_dimension=self.hparams.get("char_encoder_hidden_dim"),
        datasets_manager=self.data_manager,
    )
    elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager)
    embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        bidirectional=self.hparams.get("bidirectional"),
        combine_strategy=self.hparams.get("combine_strategy"),
        rnn_bias=True,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim"),
        datasets_manager=self.data_manager,
    )
    self.printer.good("Finished Loading the Model")
    return model
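
# A back-of-the-envelope check of the concatenated embedding width, as a
# sketch with illustrative numbers; the 2x factor for the char encoder and
# the 1024-dim ELMo representation are assumptions about the defaults.
word_dim = 100          # e.g. "glove_6B_100"
char_dim = 2 * 25       # char encoder hidden size, doubled if bidirectional
elmo_dim = 1024         # standard ELMo layer width
total_embedding_dim = word_dim + char_dim + elmo_dim  # 1174 in this example
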
lstm2seqencoder = Lstm2SeqEncoder(
    embedder=embedder,
    dropout_value=args.dropout,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    rnn_bias=True,
    device=torch.device(args.device),
    num_layers=args.num_layers,
    add_projection_layer=False,
)
model = RnnSeqCrfTagger(
    rnn2seqencoder=lstm2seqencoder,
    encoding_dim=2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim,
    device=torch.device(args.device),
    tagging_type="BIOUL",
    datasets_manager=data_manager,
)
optimizer = optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.reg)
train_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
dev_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
test_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
engine = Engine(
    model=model,
    datasets_manager=data_manager,
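    # Hedged continuation: the call is cut off in the source; the remaining
    # arguments follow the complete Engine invocation in the last snippet of
    # this section, and the args.* flag names are assumptions.
    optimizer=optimizer,
    batch_size=args.batch_size,  # assumed flag name
    save_dir=args.save_dir,  # assumed flag name
    num_epochs=args.epochs,  # assumed flag name
    save_every=args.save_every,  # assumed flag name
    log_train_metrics_every=args.log_train_metrics_every,  # assumed flag name
    train_metric=train_metric,
    validation_metric=dev_metric,
    test_metric=test_metric,
    track_for_best="macro_fscore",
)
engine.run()
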
lstm2seqencoder = Lstm2SeqEncoder(
    embedder=word_embedder,
    dropout_value=args.dropout,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    rnn_bias=True,
    device=args.device,
    num_layers=args.num_layers,
    add_projection_layer=args.add_projection_layer,
)
model = RnnSeqCrfTagger(
    rnn2seqencoder=lstm2seqencoder,
    encoding_dim=2 * args.hidden_dim,  # assumes a bidirectional encoder with "concat"
    device=args.device,
    tagging_type="BIOUL",
    datasets_manager=data_manager,
    include_start_end_transitions=False,
)
optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
train_metric = ConLL2003Metrics(datasets_manager=data_manager)
dev_metric = ConLL2003Metrics(datasets_manager=data_manager)
test_metric = ConLL2003Metrics(datasets_manager=data_manager)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    factor=0.1,
    mode="max",
    patience=25,
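    # Hedged continuation: the call is cut off in the source, so it is closed
    # minimally here. Stepping the scheduler on a dev metric is the standard
    # ReduceLROnPlateau pattern with mode="max"; how the SciWING Engine drives
    # the scheduler is not shown in this snippet, so this is a generic sketch.
)
scheduler.step(dev_macro_fscore)  # hypothetical variable holding the dev F-score
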
def setup_parscit_inference(seq_dataset_manager, tmpdir_factory):
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )
    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )
    train_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    optimizer = torch.optim.Adam(params=tagger.parameters())
    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10
    engine = Engine(
        model=tagger,
        datasets_manager=dataset_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",
    )
    engine.run()
    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    inference_client = SequenceLabellingInference(
        model=tagger,
        model_filepath=model_filepath,
        datasets_manager=dataset_manager,
    )
    return inference_client