def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    loader = SimpleDataLoader(instances, 3)
    loader.index_with(vocab)
    for i, batch in enumerate(loader):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"]
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        assert lengths.tolist() == expected_lengths

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            assert numpy.allclose(
                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                expected_top_layer[k],
                atol=1.0e-6,
            )
def benchmark_xlmr_mdl():
    from allennlp.data.data_loaders import SimpleDataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()

    instances = xlmr.dataset_reader.load_dataset(testset)
    data_loader = SimpleDataLoader(instances, 1)
    data_loader.index_with(xlmr.model.vocab)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
import tempfile

from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.training.util import evaluate


def run_training_loop():
    # The build_* and read_data helpers are defined above in the Setup section.
    dataset_reader = build_dataset_reader()
    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    train_loader, dev_loader = build_data_loaders(train_data, dev_data)
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    # You obviously won't want to create a temporary file for your training
    # results, but for execution in binder for this guide, we need to do this.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = build_trainer(model, serialization_dir, train_loader, dev_loader)
        trainer.train()

    return model, dataset_reader


# We've copied the training loop from an earlier example, with updated model
# code, above in the Setup section. We run the training loop to get a trained
# model.
model, dataset_reader = run_training_loop()

# Now we can evaluate the model on a new dataset.
test_data = list(dataset_reader.read("quick_start/data/movie_review/test.tsv"))
data_loader = SimpleDataLoader(test_data, 8)
data_loader.index_with(model.vocab)

results = evaluate(model, data_loader)
print(results)
from allennlp.data import Vocabulary
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.models import Model, SimpleTagger
from allennlp.modules.seq2seq_encoders import PassThroughEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder
from allennlp.training import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer


class TaggerTrainer:
    def __init__(self) -> None:
        # `Config` is a project-specific argument class that provides
        # model_name, train_file, dev_file, test_file, batch_size, lr,
        # epoch and device.
        self.config: Config = Config().parse_args(known_only=True)

        # 1. read the data
        bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
        reader = SequenceTaggingDatasetReader(token_indexers={"tokens": bert_token_indexers})
        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()

    def init_model(self) -> Model:
        """Build the tagging model.

        Returns:
            Model: a SimpleTagger that feeds BERT embeddings straight into
            the tag projection layer.
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        tagger.to(device=self.config.device)
        return tagger

    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = AdamOptimizer(parameters, lr=self.config.lr)  # type: ignore
        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer

    def train(self):
        self.trainer.train()
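# A minimal entry point for the tagger trainer above -- a sketch, assuming the
# project-specific `Config` class parses command-line flags into the fields
# used in __init__ (model_name, train_file, dev_file, test_file, batch_size,
# lr, epoch, device).
if __name__ == '__main__':
    TaggerTrainer().train()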
from allennlp.data import Vocabulary
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.models import BasicClassifier, Model
from allennlp.modules.seq2vec_encoders import ClsPooler
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder
from allennlp.training import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer


class TaggerTrainer:
    def __init__(self) -> None:
        # `Config` is a project-specific argument class that provides
        # model_name, train_file, dev_file, test_file, batch_size, lr,
        # classifier_lr, epoch and device.
        self.config: Config = Config().parse_args(known_only=True)

        # 1. read the data
        bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer,
        )
        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()

    def init_model(self) -> Model:
        """Build the classification model.

        Returns:
            Model: a BasicClassifier that classifies texts from the [CLS]
            representation of a pretrained transformer.
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        classifier = BasicClassifier(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}
            ),
            seq2vec_encoder=ClsPooler(embedding_dim=bert_text_field_embedder.get_output_dim()),
        )
        classifier.to(device=self.config.device)
        return classifier

    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        # Use separate learning rates for the pretrained encoder and the
        # freshly initialized classification layer.
        group_parameter_group = [
            (['_text_field_embedder.*'], {'lr': self.config.lr}),
            (['_classification_layer.*'], {'lr': self.config.classifier_lr}),
        ]
        optimizer = AdamOptimizer(
            parameters,
            parameter_groups=group_parameter_group,
            lr=self.config.lr,
        )  # type: ignore
        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer

    def train(self):
        self.trainer.train()
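# A sketch of evaluating the trained classifier on the test split that the
# constructor above reads but never uses. `evaluate_on_test` and its arguments
# are illustrative; `evaluate` and `SimpleDataLoader` are the standard AllenNLP
# utilities already used in the snippets above.
from allennlp.training.util import evaluate

def evaluate_on_test(trainer: TaggerTrainer, test_instances) -> dict:
    # Build a non-shuffling loader over the test instances and index it with
    # the vocabulary the model was trained with.
    test_loader = SimpleDataLoader(test_instances, trainer.config.batch_size, shuffle=False)
    test_loader.index_with(trainer.vocab)
    return evaluate(trainer.model, test_loader, cuda_device=trainer.config.device)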