Example #1
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        # Now finally we can iterate through batches.
        loader = SimpleDataLoader(instances, 3)
        loader.index_with(vocab)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            assert lengths.tolist() == expected_lengths

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                assert numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                )
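
The same load-index-iterate pattern works with any token indexer. Below is a minimal, self-contained sketch (not part of the original test) that swaps the ELMo character indexer for a SingleIdTokenIndexer and uses toy sentences:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Toy sentences; any whitespace-tokenizable text works here.
indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"text": TextField([Token(w) for w in sentence.split()], indexers)})
    for sentence in ["the cat sat", "dogs bark"]
]

vocab = Vocabulary.from_instances(instances)
loader = SimpleDataLoader(instances, batch_size=2)
loader.index_with(vocab)  # instances must be indexed before iterating

for batch in loader:
    # With SingleIdTokenIndexer the tensor dict is nested as
    # batch["text"]["tokens"]["tokens"]: a padded tensor of token ids.
    print(batch["text"]["tokens"]["tokens"].shape)
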
Example #2
def benchmark_xlmr_mdl():
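    # `load_xlmr_coref_model`, `testset`, `num_sentences`, `num_tokens` and
    # `print_speed_performance` are assumed to be provided by the surrounding
    # benchmark script.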

    from allennlp.data.data_loaders import SimpleDataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()

    instances = xlmr.dataset_reader.load_dataset(testset)
    data_loader = SimpleDataLoader(instances, 1)
    data_loader.index_with(xlmr.model.vocab)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
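
If the danlp reporting helper is not available, a rough stand-in (names xlmr and data_loader as in the function above; the timing logic is assumed, not part of the original) can report wall-clock time directly:

import time

from allennlp.training.util import evaluate

start = time.time()
metrics = evaluate(xlmr.model, data_loader)  # pass cuda_device=0 if the model sits on GPU 0
elapsed = time.time() - start
print(f"Evaluated {len(data_loader)} batches in {elapsed:.1f} seconds")
print('F1 : ', metrics['coref_f1'])
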
Example #3
def run_training_loop():
    # The dataset reader and the build_* / read_data helpers are defined in the
    # guide's Setup section; ClassificationTsvReader is assumed here.
    dataset_reader = ClassificationTsvReader()
    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    train_loader, dev_loader = build_data_loaders(train_data, dev_data)
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    # You obviously won't want to write your training results to a temporary
    # directory, but we need one here so this code can run in Binder for the guide.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = build_trainer(model, serialization_dir, train_loader, dev_loader)
        trainer.train()

    return model, dataset_reader


# We've copied the training loop from an earlier example, with updated model
# code, above in the Setup section. We run the training loop to get a trained
# model.
model, dataset_reader = run_training_loop()

# Now we can evaluate the model on a new dataset.
test_data = list(dataset_reader.read("quick_start/data/movie_review/test.tsv"))
data_loader = SimpleDataLoader(test_data, 8)
data_loader.index_with(model.vocab)

results = evaluate(model, data_loader)
print(results)
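
The build_* helpers and read_data come from the guide's Setup section and are not shown in this excerpt. A plausible sketch of build_data_loaders, assuming the same batch size of 8 that the test loader above uses:

from typing import List, Tuple

from allennlp.data import Instance
from allennlp.data.data_loaders import SimpleDataLoader


def build_data_loaders(
    train_data: List[Instance], dev_data: List[Instance]
) -> Tuple[SimpleDataLoader, SimpleDataLoader]:
    # Shuffle only the training data; the caller indexes both loaders with the vocab.
    train_loader = SimpleDataLoader(train_data, 8, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, 8, shuffle=False)
    return train_loader, dev_loader
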
Example #4
class TaggerTrainer:
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)
        
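        # 1. init the dataset reader and read the data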
        bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
        reader = SequenceTaggingDatasetReader(token_indexers={"tokens": bert_token_indexers})

        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)
        
        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()
    
    def init_crf_model(self) -> Model:
        """Build the tagger model (a ``SimpleTagger`` with BMES span-F1
        metrics; despite the method name, no CRF layer is used).
        """
        # 1. import related modules
        from allennlp.models import SimpleTagger
        from allennlp.modules.seq2seq_encoders import PassThroughEncoder
        from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
        from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={
                    'tokens': bert_text_field_embedder
                }
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        
        tagger.to(device=self.config.device)
        return tagger
    
    def init_model(self) -> Model:
        """Build the tagger model from ``self.vocab`` and ``self.config``.

        Returns:
            Model: the tagger, moved to the configured device
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={
                    'tokens': bert_text_field_embedder
                }
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        
        tagger.to(device=self.config.device)
        return tagger
    
    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = AdamOptimizer(parameters, lr=self.config.lr)  # type: ignore
        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer
    
    def train(self):
        self.trainer.train()
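
The Config class is not shown in this example. The parse_args(known_only=True) call looks like the tap (typed-argument-parser) package; below is a hypothetical sketch with the fields this trainer reads (field names come from the code above, all defaults are made up; Example #5 additionally reads a classifier_lr field):

from tap import Tap


class Config(Tap):
    model_name: str = "bert-base-cased"  # placeholder default
    train_file: str = "data/train.txt"   # placeholder paths
    dev_file: str = "data/dev.txt"
    test_file: str = "data/test.txt"
    batch_size: int = 16
    lr: float = 2e-5
    epoch: int = 10
    device: int = 0  # CUDA device index; passed to both model.to() and the trainer
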
Example #5
class TaggerTrainer:
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

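        # 1. init the dataset reader and read the data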
        bert_token_indexers = PretrainedTransformerIndexer(
            model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(
            model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer)

        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances,
                                                  self.config.batch_size,
                                                  shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances,
                                                self.config.batch_size,
                                                shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()

    def init_model(self) -> Model:
        """Build the classifier model from ``self.vocab`` and ``self.config``.

        Returns:
            Model: the classifier, moved to the configured device
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(
            model_name=self.config.model_name)
        classifier = BasicClassifier(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}),
            seq2vec_encoder=ClsPooler(
                embedding_dim=bert_text_field_embedder.get_output_dim()),
        )
        classifier.to(device=self.config.device)
        return classifier

    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters()
                      if p.requires_grad]

        group_parameter_group = [(['_text_field_embedder.*'], {
            'lr': self.config.lr
        }), (['_classification_layer.*'], {
            'lr': self.config.classifier_lr
        })]

        optimizer = AdamOptimizer(parameters,
                                  parameter_groups=group_parameter_group,
                                  lr=self.config.lr)  # type: ignore

        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer

    def train(self):
        self.trainer.train()
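
In the optimizer above, parameter_groups pairs lists of regexes with per-group overrides: parameters whose names match _text_field_embedder.* keep the base lr, while _classification_layer.* parameters get classifier_lr. A minimal entry point for either trainer class (assumed, not part of the original) would be:

if __name__ == "__main__":
    TaggerTrainer().train()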