Beispiel #1
0
def read_data(reader: DatasetReader, tgt_domain: str, input_path: str,
              domains: List) -> Tuple[Iterable[Instance], Iterable[Instance],
                                      Iterable[Instance]]:
    """Read multi-domain sentiment data for domain-adaptation experiments.

    Training data is the concatenation of every domain except ``tgt_domain``;
    the target domain's file is split into validation and test portions.

    Parameters
    ----------
    reader : DatasetReader used to parse each ``<domain>_neg.txt`` file.
    tgt_domain : the held-out target domain.
    input_path : directory prefix; files live at ``input_path/<domain>/<domain>_neg.txt``.
    domains : all domain names, including ``tgt_domain``.

    Returns
    -------
    (training_data, validation_data, test_data) as ``AllennlpDataset`` objects.

    Raises
    ------
    ValueError if ``domains`` contains no source domain besides ``tgt_domain``.
    """
    print("Reading data")

    training_data = None
    for domain in domains:
        if domain == tgt_domain:
            continue
        domain_data = reader.read(input_path + domain + '/' + domain +
                                  '_neg.txt')
        # `is None` (identity), not `== None`: dataset objects may define
        # __eq__ in ways that make the equality comparison misbehave.
        if training_data is None:
            training_data = domain_data
        else:
            training_data += domain_data

    if training_data is None:
        raise ValueError(
            "No source-domain data was read; `domains` must contain at least "
            "one domain other than tgt_domain.")

    valid_test_data = reader.read(input_path + tgt_domain + '/' + tgt_domain +
                                  '_neg.txt')

    # Validation set: 25% of the target data, capped at 2000 instances.
    valid_size = min(2000, int(len(valid_test_data) * 0.25))

    validation_data = valid_test_data[:valid_size]
    test_data = valid_test_data[valid_size:]

    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    test_data = AllennlpDataset(test_data)

    print("train:", len(training_data), "validation:", len(validation_data),
          "test:", len(test_data))
    return training_data, validation_data, test_data
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    """Read the SNIPS train/validation utterance files as AllennlpDatasets."""
    print("Reading data")

    train_path = '../data/snips/utterances_train_features.txt'
    valid_path = '../data/snips/utterances_valid_features.txt'

    training_data = AllennlpDataset(reader.read(train_path))
    validation_data = AllennlpDataset(reader.read(valid_path))

    print("train:", len(training_data), "validation:", len(validation_data))
    return training_data, validation_data
Beispiel #3
0
    def test_from_params(self):
        """MaxTokensBatchSampler.from_params honors both defaults and overrides."""
        dataset = AllennlpDataset(self.instances, self.vocab)
        sorting_keys = ["s1", "s2"]

        # Minimal params: padding_noise should fall back to its default (0.1).
        params = Params({})
        params["sorting_keys"] = sorting_keys
        params["max_tokens"] = 32
        sampler = MaxTokensBatchSampler.from_params(params=params,
                                                    data_source=dataset)
        assert sampler.sorting_keys == sorting_keys
        assert sampler.padding_noise == 0.1
        assert sampler.max_tokens == 32

        # Fully-specified params: every explicit value must win.
        params = Params({
            "sorting_keys": sorting_keys,
            "padding_noise": 0.5,
            "max_tokens": 100,
        })
        sampler = MaxTokensBatchSampler.from_params(params=params,
                                                    data_source=dataset)
        assert sampler.sorting_keys == sorting_keys
        assert sampler.padding_noise == 0.5
        assert sampler.max_tokens == 100
 def test_guess_sorting_key_picks_the_longest_key(self):
     """_guess_sorting_keys should select the field with the longest token list."""
     dataset = AllennlpDataset(self.instances, vocab=self.vocab)
     sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0)
     short_tokens = [Token(t) for t in "what is this ?".split()]
     long_tokens = [Token(t) for t in "this is a not very long passage".split()]
     # Three identical instances; "passage" is longer than "question" in each.
     instances = [
         Instance({
             "question": TextField(short_tokens, self.token_indexers),
             "passage": TextField(long_tokens, self.token_indexers),
         })
         for _ in range(3)
     ]
     assert sampler.sorting_keys is None
     sampler._guess_sorting_keys(instances)
     assert sampler.sorting_keys == [("passage", "tokens___tokens")]
    def test_from_params(self):
        """BucketBatchSampler.from_params honors both defaults and overrides."""
        dataset = AllennlpDataset(self.instances, self.vocab)
        sorting_keys = [("s1", "nt"), ("s2", "nt2")]

        # Minimal params: padding_noise should fall back to its default (0.1).
        params = Params({})
        params["sorting_keys"] = sorting_keys
        params["batch_size"] = 32
        sampler = BucketBatchSampler.from_params(params=params,
                                                 data_source=dataset)
        assert sampler.sorting_keys == sorting_keys
        assert sampler.padding_noise == 0.1
        assert sampler.batch_size == 32

        # Fully-specified params, including drop_last.
        params = Params({
            "sorting_keys": sorting_keys,
            "padding_noise": 0.5,
            "batch_size": 100,
            "drop_last": True,
        })
        sampler = BucketBatchSampler.from_params(params=params,
                                                 data_source=dataset)
        assert sampler.sorting_keys == sorting_keys
        assert sampler.padding_noise == 0.5
        assert sampler.batch_size == 100
        assert sampler.drop_last
Beispiel #6
0
    def test_batch_count(self):
        """len() of the DataLoader must equal the sampler's batch count."""
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"])
        # The collate_fn only wraps instances in an allennlp Batch instead of
        # building tensors; that is all this count check needs.
        loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda batch: Batch(batch))

        assert len(loader) == 3
Beispiel #7
0
    def test_batch_of_entirely_empty_lists_works(self):
        """forward() must not crash on a batch made solely of empty instances."""
        dataset = AllennlpDataset([self.empty_instance] * 2, self.vocab)

        model = DummyModel(self.vocab)
        model.eval()
        # Pull a single 2-instance batch and run it through the model.
        batch = next(iter(DataLoader(dataset, batch_size=2)))
        model.forward(**batch)
    def test_from_params_in_trainer(self):
        """Build a Trainer from Params and verify SlantedTriangular wiring.

        Checks that ``num_epochs`` and the computed ``num_steps_per_epoch``
        reach the learning rate scheduler, and that a scheduler-level
        ``num_epochs`` overrides the trainer-level one.
        """
        # This is more of an integration test, making sure that a bunch of pieces fit together
        # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
        params = Params(
            {
                "num_epochs": 5,
                "learning_rate_scheduler": {
                    "type": "slanted_triangular",
                    "gradual_unfreezing": True,
                    "discriminative_fine_tuning": True,
                    "decay_factor": 0.5,
                },
            }
        )
        # The method called in the logic below only checks the length of this list, not its
        # contents, so this should be safe.
        instances = AllennlpDataset([1] * 40)
        optim = self._get_optimizer()
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=DataLoader(instances, batch_size=10),
        )
        assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

        # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
        # that num_steps_per_epoch is computed and passed correctly.  This logic happens inside of
        # `Trainer.from_partial_objects`.
        assert trainer._learning_rate_scheduler.num_epochs == 5
        # 40 instances at batch_size=10 -> 4 steps per epoch.
        assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

        # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
        # really want to.  Not sure why you would ever want to in this case; this is just testing
        # the functionality.
        params = Params(
            {
                "num_epochs": 5,
                "learning_rate_scheduler": {
                    "type": "slanted_triangular",
                    "num_epochs": 3,
                    "gradual_unfreezing": True,
                    "discriminative_fine_tuning": True,
                    "decay_factor": 0.5,
                },
            }
        )
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=DataLoader(instances, batch_size=10),
        )
        # Scheduler-level num_epochs (3) wins over the trainer-level value (5).
        assert trainer._learning_rate_scheduler.num_epochs == 3
Beispiel #9
0
    def test_create_batches_groups_correctly(self):
        """With zero padding noise, bucketing by length is deterministic."""
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"])

        # Materialize each index batch back into instance objects.
        grouped_instances = [[self.instances[i] for i in batch] for batch in sampler]
        assert grouped_instances == [
            [self.instances[4], self.instances[2]],
            [self.instances[0], self.instances[1]],
            [self.instances[3]],
        ]
Beispiel #10
0
    def test_elmo_bilm(self):
        """Run _ElmoBiLm over fixture sentences and compare top-layer embeddings.

        Batches of 3 instances are pushed through the biLM; for each batch the
        mask lengths and the top-layer activations (after stripping sentence
        boundaries) are checked against precomputed reference embeddings.
        """
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        dataset = AllennlpDataset(instances, vocab)
        # Now finally we can iterate through batches.
        # batch_size=3 matches the fixture layout: sentences[k][i] is the k-th
        # sentence of batch i, so each loader batch realigns with the fixture.
        loader = DataLoader(dataset, 3)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            # activations[2] is the top biLM layer; boundaries (<S>, </S>) are
            # stripped before comparing against the reference embeddings.
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6,
                    ))
Beispiel #11
0
    def test_drop_last_works(self):
        """drop_last=True must discard the final, incomplete batch."""
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = BucketBatchSampler(
            dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True,
        )
        # The collate_fn keeps instances as allennlp Batches rather than
        # building tensors, which is all these statistics need.
        loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))
        stats = self.get_batches_stats(list(loader))

        # Every surviving batch is full...
        assert all(length == 2 for length in stats["batch_lengths"])

        # ...and exactly one instance was lost with the dropped last batch.
        assert stats["total_instances"] == len(self.instances) - 1
    def _read(self, file_path: str, bagging: bool = False) -> Iterable[Instance]:
        """Read CoNLL-formatted sentences from ``file_path`` into an AllennlpDataset.

        Each non-divider chunk is parsed into (tokens, pos, chunk, ner) columns.
        When ``self.pseudo`` is set, every sentence is replicated once per pseudo
        tag, with the tag prepended to the tokens and an extra "O" NER label.
        When ``bagging`` is True, the instances are bootstrap-resampled
        (sampled with replacement) before being returned.
        """
        ret = []
        # If `file_path` is a URL, redirect to the cache.
        file_path = cached_path(file_path)

        # NOTE(review): `num_virtual_models` is read from enclosing/module
        # scope — confirm it is defined wherever this reader is imported.
        pseudo_tags = [f"[pseudo{i}]" for i in range(1, 10)][:num_virtual_models]

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            all_lines = list(data_file)
            # Group into alternating divider / sentence chunks; when
            # `is_divider` is False, `lines` holds the words of one sentence.
            for is_divider, lines in itertools.groupby(all_lines, _is_divider):
                if is_divider:
                    continue
                fields = [line.strip().split() for line in lines]
                # The unzipping trick returns tuples, but our Fields need lists.
                fields = [list(field) for field in zip(*fields)]
                tokens_, pos_tags, chunk_tags, ner_tags = fields

                if self.pseudo:
                    for tag in pseudo_tags:
                        # Prepend the pseudo tag token and a matching "O" label.
                        tokens = [Token(token) for token in [tag] + tokens_]
                        ret.append(self.text_to_instance(
                            tokens, pos_tags, chunk_tags, ["O"] + ner_tags))
                else:
                    tokens = [Token(token) for token in tokens_]
                    ret.append(self.text_to_instance(
                        tokens, pos_tags, chunk_tags, ner_tags))

        if bagging:
            print("bagging sampling ....")
            # Bootstrap: draw len(ret) instances with replacement.
            ret = random.choices(ret, k=len(ret))
        return AllennlpDataset(ret)
Beispiel #13
0
    def test_create_batches_groups_correctly(self):
        """MaxTokensBatchSampler must form the expected token-budget buckets."""
        dataset = AllennlpDataset(self.instances, vocab=self.vocab)
        sampler = MaxTokensBatchSampler(dataset,
                                        max_tokens=8,
                                        padding_noise=0,
                                        sorting_keys=["text"])

        observed = [[self.instances[i] for i in batch] for batch in sampler]
        expected_groups = [
            [self.instances[4], self.instances[2]],
            [self.instances[0], self.instances[1]],
            [self.instances[3]],
        ]
        # Batch order is not guaranteed, so match groups one by one and make
        # sure each expected group is consumed exactly once.
        for group in observed:
            assert group in expected_groups
            expected_groups.remove(group)
        assert expected_groups == []
Beispiel #14
0
def main():
    """Run a universal-trigger (hotflip) attack against a BERT SNLI model.

    Loads a pretrained SNLI model, filters the dev set to one gold label,
    then iteratively updates a small prefix of trigger tokens to flip
    predictions toward ``target_label``, re-measuring accuracy each batch.
    """
    # Load SNLI dataset

    bert_indexer = PretrainedTransformerIndexer('bert-base-uncased')
    tokenizer = PretrainedTransformerTokenizer(model_name='bert-base-uncased')
    reader = SnliReader(token_indexers={'tokens': bert_indexer},
                        tokenizer=tokenizer,
                        combine_input_fields=True)

    # single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
    # tokenizer = WordTokenizer(end_tokens=["@@NULL@@"]) # add @@NULL@@ to the end of sentences
    # reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)
    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl'
    )
    # Load model and vocab
    # NOTE(review): archive paths below are machine-specific absolute paths.
    model_type = "pred"
    # model_type = "merged"
    if model_type == "merged":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/merged_model.tar.gz'
        ).model
    elif model_type == "pred":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/bert_trained2.tar.gz'
        ).model
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. to the input tokens
    utils.add_hooks(model)

    # The two model types nest the transformer embeddings differently.
    if model_type == "merged":
        embedding_weight = model.combined_model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight  # save the word embedding matrix
    else:
        embedding_weight = model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight
    # print(model.combined_model._text_field_embedder._modules["token_embedder_tokens"].transformer_model.embeddings.word_embeddings)
    # print(embedding_weight.size())
    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32

    # iterator = DataIterator(batch_size=universal_perturb_batch_size)
    # iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = 'entailment'  # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance['label'].label == dataset_label_filter:
            subset_dev_dataset.append(instance)
    print(len(subset_dev_dataset))
    print(len(dev_dataset))
    # the attack is targeted towards a specific class
    # target_label = "0" # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    #tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(model,
                       subset_dev_dataset,
                       vocab,
                       tokenizer,
                       model_type,
                       trigger_token_ids=None,
                       snli=True)
    model.train()  # rnn cannot do backwards in train mode

    # Initialize triggers
    num_trigger_tokens = 2  # one token prepended
    # encode("a")[1] picks the wordpiece id of "a" (index 0 is [CLS]).
    start_tok = tokenizer.tokenizer.encode("a")[1]
    print(start_tok)
    trigger_token_ids = [start_tok] * num_trigger_tokens
    # sample batches, update the triggers, and repeat

    # NOTE(review): despite its name, this wraps the FULL dev_dataset, not
    # subset_dev_dataset — confirm whether the attack batches were meant to
    # come from the label-filtered subset only.
    subset_dev_dataset_dataset = AllennlpDataset(dev_dataset, vocab)
    train_sampler = BucketBatchSampler(subset_dev_dataset_dataset,
                                       batch_size=universal_perturb_batch_size,
                                       sorting_keys=["tokens"])
    train_dataloader = DataLoader(subset_dev_dataset_dataset,
                                  batch_sampler=train_sampler)
    # for batch in lazy_groups_of(iterators(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
    for batch in train_dataloader:
        # get model accuracy with current triggers
        utils.get_accuracy(model,
                           subset_dev_dataset,
                           vocab,
                           tokenizer,
                           model_type,
                           trigger_token_ids,
                           snli=True)
        model.train()  # rnn cannot do backwards in train mode

        # get grad of triggers
        averaged_grad = utils.get_average_grad(model,
                                               batch,
                                               trigger_token_ids,
                                               target_label,
                                               snli=True)
        # find attack candidates using an attack method
        # increase_loss=False because this is a targeted attack: we move the
        # prediction toward target_label by minimizing its loss.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        increase_loss=False,
                                                        num_candidates=40)
        print("------")
        print(cand_trigger_token_ids)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)
        # query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids,
                                                      snli=True)
 def _batch_instances(self, instances: List[Instance], batch_size=None):
     """Index `instances` with the model vocab and return a DataLoader over them.

     A falsy `batch_size` means one batch containing every instance.
     """
     effective_batch_size = batch_size if batch_size else len(instances)
     dataset = AllennlpDataset(instances)
     dataset.index_with(self._model.vocab)
     return DataLoader(dataset, batch_size=effective_batch_size)
Beispiel #16
0
def main():
    """Train (or load) an LSTM SST classifier and run a universal-trigger attack.

    Reads the 2-class Stanford Sentiment Treebank, trains or restores an LSTM
    classifier, then repeatedly hotflips a 3-token trigger prefix on dev
    examples of one label to degrade the model's accuracy on them.
    """
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # NOTE(review): EMBEDDING_TYPE is a module-level setting; if it is neither
    # "None" nor "w2v", token_embedding/word_embedding_dim are never assigned
    # and the BasicTextFieldEmbedder line below raises NameError — confirm.
    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        # NOTE(review): ("tokens") is just the string "tokens" — the parentheses
        # are redundant, not a tuple.
        train_sampler = BucketBatchSampler(train_data, batch_size=32, sorting_keys=[("tokens")])
        dev_sampler = BucketBatchSampler(dev_data, batch_size=32, sorting_keys=[("tokens")])
        train_loader = DataLoader(train_data, batch_sampler=train_sampler)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler)
        optimizer = optim.Adam(model.parameters())
        trainer = GradientDescentTrainer(model=model,
                                         optimizer=optimizer,
                                         data_loader=train_loader,
                                         validation_data_loader=dev_loader,
                                         num_epochs=5,
                                         patience=1,
                                         cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)
    targeted_dev_data = AllennlpDataset(targeted_dev_data, vocab)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    targeted_sampler = BasicBatchSampler(sampler=SequentialSampler(targeted_dev_data),
                                         batch_size=universal_perturb_batch_size,
                                         drop_last=False)  # TODO don't drop last
    targeted_loader = DataLoader(targeted_dev_data, batch_sampler=targeted_sampler)
    # sample batches, update the triggers, and repeat
    for epoch in range(5):
        for batch in targeted_loader:
            # get accuracy with current triggers
            utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
            model.train()  # rnn cannot do backwards in eval mode

            # get gradient w.r.t. trigger embeddings for current batch
            averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

            # pass the gradients to a particular attack to generate token candidates for each token.
            # increase_loss=True: an untargeted attack that hurts the gold label.
            cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                            embedding_weight,
                                                            trigger_token_ids,
                                                            num_candidates=40,
                                                            increase_loss=True)
            # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
            #                                                trigger_token_ids,
            #                                                num_candidates=40)
            # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
            #                                                        embedding_weight,
            #                                                        trigger_token_ids,
            #                                                        tree,
            #                                                        100,
            #                                                        num_candidates=40,
            #                                                        increase_loss=True)

            # Tries all of the candidates and returns the trigger sequence with highest loss.
            trigger_token_ids = utils.get_best_candidates(model,
                                                          batch,
                                                          trigger_token_ids,
                                                          cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
    def _read(self, file_path: str, bagging=False):
        """Read delimiter-separated source/target sequence pairs into a dataset.

        Each row must contain exactly two fields; empty fields are skipped.
        With ``self.pseudo`` set, every pair is replicated once per pseudo tag
        (passing the tag index as ``v_i``); with ``bagging`` True, the raw
        lines are bootstrap-resampled (with replacement) before parsing.

        Raises ConfigurationError on rows that do not have exactly two fields.
        """
        if self.pseudo:
            print(
                "Now I added the pseudo tokens to the beginnning of soure!!!! not target!!!!!!"
            )

        # NOTE(review): `num_virtual_models` is read from enclosing/module
        # scope — confirm it is defined wherever this reader is imported.
        pseudo_tags = [f"[pseudo{i}]" for i in range(1, 10)][:num_virtual_models]

        ret = []

        # Reset exceeded counts (presumably incremented by text_to_instance).
        self._source_max_exceeded = 0
        self._target_max_exceeded = 0
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            rows = list(data_file)

            if bagging:
                print("Executed bagging!!!!!!!!")
                # Bootstrap: draw len(rows) lines with replacement.
                rows = random.choices(rows, k=len(rows))

            for line_num, row in enumerate(
                    csv.reader(rows,
                               delimiter=self._delimiter,
                               quoting=self.quoting)):
                if len(row) != 2:
                    raise ConfigurationError(
                        "Invalid line format: %s (line number %d)" %
                        (row, line_num + 1))
                source_sequence, target_sequence = row
                if len(source_sequence) == 0 or len(target_sequence) == 0:
                    continue

                if self.pseudo:
                    # NOTE: the pseudo tag text is deliberately NOT prepended
                    # here (the prepend lines were disabled upstream); only
                    # the virtual-model index v_i distinguishes the copies.
                    for i in range(len(pseudo_tags)):
                        ret.append(
                            self.text_to_instance(source_sequence,
                                                  target_sequence,
                                                  v_i=i))
                else:
                    ret.append(
                        self.text_to_instance(source_sequence,
                                              target_sequence))

            # BUG FIX: removed a debug print that referenced the nonexistent
            # attribute `self._70` and raised AttributeError at runtime; the
            # truncation counts are logged properly below.

        if self._source_max_tokens and self._source_max_exceeded:
            logger.info(
                "In %d instances, the source token length exceeded the max limit (%d) and were truncated.",
                self._source_max_exceeded,
                self._source_max_tokens,
            )
        if self._target_max_tokens and self._target_max_exceeded:
            logger.info(
                "In %d instances, the target token length exceeded the max limit (%d) and were truncated.",
                self._target_max_exceeded,
                self._target_max_tokens,
            )

        return AllennlpDataset(ret)
 def from_list_to_dataset(self, data):
     """Convert raw `data` to instances and wrap them in an AllennlpDataset."""
     instances = self._from_list_to_instance(data)
     return AllennlpDataset(instances)
Beispiel #19
0
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "",
                    embedding_sources_mapping: Dict[str, str] = None,
                    in_fold = None,
                    num_folds = None,
                    ewc_weight=None) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab: ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
        NOTE(review): this argument is accepted but never used anywhere in this
        function body — confirm whether it should be forwarded to the trainer.
    embedding_sources_mapping: ``Dict[str, str]``, optional (default=None)
        mapping from model paths to the pretrained embedding filepaths
        used during fine-tuning.
    in_fold : ``int``, optional (default=None)
        When given together with ``num_folds``, only this single fold of the
        k-fold split is trained instead of looping over every fold.
    num_folds : ``int``, optional (default=None)
        When given, the training data is split with ``KFold`` and one deep copy
        of the model is trained per fold; otherwise a single fine-tuning run
        over the whole training set is performed.
    ewc_weight : ``float``, optional (default=None)
        When given, an EWC penalty scaled by this weight is added to the
        training loss on every forward pass.
        NOTE(review): if ``ewc_weight`` is set but the config provides no
        ``test`` dataset, ``extra_test_loader`` is never defined and this
        function raises ``NameError`` — confirm a test set is meant to be a
        hard requirement for EWC here.
    """
    prepare_environment(params)
    # Refuse to overwrite an existing, non-empty serialization directory.
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Persist the exact experiment configuration alongside the results.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # The model object already exists, so any configured 'model' section is dropped.
    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

        # Grow the embedding matrices to cover the newly added vocabulary entries.
        model.extend_embedder_vocab(embedding_sources_mapping)

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the 'no_grad' regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    dl_params = params.pop("data_loader")
    if test_data is not None:
        # A fixed-seed shuffle keeps the 2000-instance held-out subset reproducible.
        rand = random.Random(1234)
        test_data.index_with(vocab)
        shuffled_test = copy(test_data.instances)
        rand.shuffle(shuffled_test)
        extra_test = shuffled_test[:2000]

        # Loader over the 2000-instance subset, later consumed by EWC.
        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": AllennlpDataset(extra_test, vocab)})
        extra_test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

        # Loader over the full test set, used by the per-epoch evaluation callback.
        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": test_data})
        test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

    master_model = model
    global_metrics = {}
    training_metrics = []
    final_metrics = {}
    master_trainer = trainer_params.as_dict()

    if num_folds is not None:

        rand = random.Random(1234)

        fold_train = []
        fold_test = []

        fold_train_loader = []
        fold_test_loader = []

        shuffled_instances = copy(train_data.instances)
        rand.shuffle(shuffled_instances)



        # shuffle=False because the instances were already shuffled (seeded) above.
        kfold = KFold(n_splits=num_folds, random_state=None, shuffle=False)
        computed_folds = list(kfold.split(shuffled_instances))

        # Build one (train, test) dataset pair and matching data loaders per fold.
        for fold in range(num_folds):
            train_indexes, test_indexes = computed_folds[fold]
            new_train = [shuffled_instances[i] for i in train_indexes]
            new_test = [shuffled_instances[i] for i in test_indexes]
            fold_train.append(AllennlpDataset(new_train, vocab=vocab))
            fold_test.append(AllennlpDataset(new_test, vocab=vocab))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_test[-1]})
            fold_test_loader.append(DataLoader.from_params(params.pop("fold_test_data_loader",keys)))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_train[-1]})
            fold_train_loader.append(DataLoader.from_params(params.pop("fold_train_data_loader", keys)))

        # Train either the single requested fold or every fold in turn,
        # starting each fold from a fresh deep copy of the original model.
        for fold in ([in_fold] if in_fold is not None else range(num_folds)):
            fold_model = deepcopy(master_model)
            eval_epoch_callback = EvalEpochCallback(fold, fold_test_loader[fold], test_loader, global_metrics)
            callbacks = [eval_epoch_callback]
            if ewc_weight is not None:
                ewc = EWC(extra_test_loader)

                # Wrap forward() so training adds the scaled EWC penalty to the loss.
                def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                    ewc_loss = 0
                    if ewc.model.training:
                        ewc_loss = ewc.penalty(ewc.model)
                    ret = ewc.model.old_forward(*args, **kwargs)
                    ret["loss"] += ewc_weight * ewc_loss
                    return ret

                fold_model.old_forward = fold_model.forward
                fold_model.forward = ewc_forward
                callbacks.append(CallLossCallback(ewc))

            trainer = Trainer.from_params(model=fold_model,
                                          serialization_dir=serialization_dir,
                                          data_loader=fold_train_loader[fold],
                                          train_data=train_data,
                                          validation_data=None,
                                          params=Params(deepcopy(master_trainer)),
                                          validation_data_loader=None,
                                          epoch_callbacks=callbacks)

            training_metrics.append(trainer.train())
            del fold_model
            del trainer
            del eval_epoch_callback

            # Delete checkpoint files between folds so they do not accumulate on disk.
            state = glob(serialization_dir+"/*.th")
            for file in state:
                logger.info("deleting state - {}".format(file))
                os.unlink(file)
    else:
        callbacks = []
        if ewc_weight is not None:
            ewc = EWC(extra_test_loader)

            # Wrap forward() so training adds the scaled EWC penalty to the loss.
            def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                ewc_loss = 0
                if ewc.model.training:
                    ewc_loss = ewc.penalty(ewc.model)
                ret = ewc.model.old_forward(*args, **kwargs)
                ret["loss"] += ewc_weight * ewc_loss
                return ret

            model.old_forward = model.forward
            model.forward = ewc_forward
            callbacks.append(CallLossCallback(ewc))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": train_data})
        train_data.index_with(vocab)
        train_data_loader = DataLoader.from_params(params.pop("train_loader",keys))

        if validation_data is not None:
            validation_data.index_with(vocab)
            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": validation_data})

            validation_data_loader = DataLoader.from_params(params.pop("validation_loader", keys))
        else:
            validation_data_loader = None

        # Let the model switch itself into fine-tuning mode if it supports it.
        if "finetune" in dir(model):
            model.finetune()
            logger.info("Fine tuning model")
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      data_loader=train_data_loader,
                                      train_data=train_data,
                                      validation_data=None,
                                      params=Params(deepcopy(master_trainer)),
                                      validation_data_loader=validation_data_loader,
                                      epoch_callbacks=callbacks)

        training_metrics = trainer.train()
        archive_model(serialization_dir)

    final_metrics["fine_tune"] = global_metrics
    final_metrics["training"] = training_metrics

    # Write the aggregated metrics both to disk and to the log.
    metrics_json = json.dumps(final_metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)
    return model
Beispiel #20
0
    def load_dataset(self, dataset: str):
        """Load *dataset*, materialize its instances, and return them as an ``AllennlpDataset``."""
        produced = self._multi_worker_islice(self._load(dataset))
        # Lazy iterables are materialized so the dataset is backed by a concrete list.
        collected = produced if isinstance(produced, list) else list(produced)
        return AllennlpDataset(collected)