def test_from_params(self):
    """Build samplers through `from_params` and check default and explicit settings."""
    dataset = AllennlpDataset(self.instances, self.vocab)
    sorting_keys = [("s1", "nt"), ("s2", "nt2")]

    # Only the sorting keys and the batch size are configured here; the
    # test expects `padding_noise` to come back as 0.1.
    minimal_params = Params({})
    minimal_params["sorting_keys"] = sorting_keys
    minimal_params["batch_size"] = 32
    sampler = BucketBatchSampler.from_params(params=minimal_params, data_source=dataset)

    assert sampler.sorting_keys == sorting_keys
    assert sampler.padding_noise == 0.1
    assert sampler.batch_size == 32

    # Every option supplied explicitly.
    full_config = {
        "sorting_keys": sorting_keys,
        "padding_noise": 0.5,
        "batch_size": 100,
        "drop_last": True,
    }
    sampler = BucketBatchSampler.from_params(params=Params(full_config), data_source=dataset)

    assert sampler.sorting_keys == sorting_keys
    assert sampler.padding_noise == 0.5
    assert sampler.batch_size == 100
    assert sampler.drop_last
def test_guess_sorting_key_picks_the_longest_key(self):
    """Without configured sorting keys, guessing should settle on the longer "passage" field."""
    sampler = BucketBatchSampler(batch_size=2, padding_noise=0)

    question_tokens = [Token(word) for word in ["what", "is", "this", "?"]]
    passage_tokens = [
        Token(word) for word in ["this", "is", "a", "not", "very", "long", "passage"]
    ]

    # Three identical instances: a 4-token question next to a 7-token passage.
    instances = [
        Instance({
            "question": TextField(question_tokens, self.token_indexers),
            "passage": TextField(passage_tokens, self.token_indexers),
        })
        for _ in range(3)
    ]

    assert sampler.sorting_keys is None
    sampler._guess_sorting_keys(instances)
    assert sampler.sorting_keys == ["passage"]
def build_data_loaders(
        train_data: torch.utils.data.Dataset,
        dev_data: torch.utils.data.Dataset = None,
        batch_size: int = 8):
    """Build bucketed data loaders for the training data and, optionally, the dev data.

    Both loaders use a `BucketBatchSampler` sorted on the "text" field with
    no padding noise.

    Parameters
    ----------
    train_data
        Dataset used to build the training loader.
    dev_data
        Optional dataset for the validation loader. When it is `None` (or
        otherwise falsy) no validation loader is built.
    batch_size
        Batch size passed to both samplers.

    Returns
    -------
    A `(train_loader, dev_loader)` tuple; `dev_loader` is `None` when no
    validation loader was built.
    """
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    batch_sampler = BucketBatchSampler(train_data,
                                       batch_size=batch_size,
                                       sorting_keys=["text"],
                                       padding_noise=0)
    train_loader = DataLoader(train_data, batch_sampler=batch_sampler)

    # Bind the name up front: previously the return statement raised
    # UnboundLocalError whenever `dev_data` was not supplied.
    dev_loader = None
    if dev_data:
        dev_batch_sampler = BucketBatchSampler(dev_data,
                                               batch_size=batch_size,
                                               sorting_keys=["text"],
                                               padding_noise=0)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_batch_sampler)

    return train_loader, dev_loader
def test_disable_shuffle(self):
    """With `shuffle=False` the sampler must yield the expected batches in order."""
    sampler = BucketBatchSampler(batch_size=2, sorting_keys=["text"], shuffle=False)

    grouped_instances = [
        [self.instances[idx] for idx in indices]
        for indices in sampler.get_batch_indices(self.instances)
    ]
    expected_groups = [
        [self.instances[4], self.instances[2]],
        [self.instances[0], self.instances[1]],
        [self.instances[3]],
    ]

    # Check the batch count first: comparing group-by-group alone would pass
    # vacuously if the sampler produced fewer batches than expected.
    assert len(grouped_instances) == len(expected_groups)
    for group, expected in zip(grouped_instances, expected_groups):
        assert group == expected
def create_dataloader(
    dataset: InstancesDataset,
    batch_size: int,
    data_bucketing: bool = False,
    batches_per_epoch: Optional[int] = None,
) -> DataLoader:
    """Create the AllenNLP DataLoader for a data set

    Parameters
    ----------
    dataset
        The data set for the DataLoader
    batch_size
        Size of the batch.
    data_bucketing
        If enabled, batches are drawn through a `BucketBatchSampler`.
        The flag is ignored when `dataset` is an `IterableDataset`.
    batches_per_epoch
        Determines the number of batches after which an epoch ends.
        If the number is smaller than the total amount of batches in your data,
        the second "epoch" will take off where the first "epoch" ended.
        If this is `None`, then an epoch is set to be one full pass through your data.

    Returns
    -------
    data_loader
    """
    use_bucketing = data_bucketing and not isinstance(dataset, IterableDataset)
    if not use_bucketing:
        # Plain fixed-size batching.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          batches_per_epoch=batches_per_epoch)

    bucket_sampler = BucketBatchSampler(data_source=dataset, batch_size=batch_size)
    return DataLoader(
        dataset,
        batch_sampler=bucket_sampler,
        batches_per_epoch=batches_per_epoch,
    )
def test_drop_last_works(self):
    """With `drop_last=True` the trailing partial batch must be discarded."""
    sampler = BucketBatchSampler(
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )

    def collate_fn(x, **kwargs):
        # Testing collate function: it does not create tensors, it only
        # wraps the instances in an allennlp Batch.
        return Batch(x)

    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(),
        "fake_path",
        batch_sampler=sampler,
    )
    data_loader.collate_fn = collate_fn
    data_loader.index_with(self.vocab)

    stats = self.get_batches_stats(list(data_loader))

    # all batches have length batch_size
    assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

    # we should have lost one instance by skipping the last batch
    expected_total = len(self.instances) - 1
    assert stats["total_instances"] == expected_total
def test_create_batches_groups_correctly(self):
    """Each produced batch must match exactly one expected group, in any order."""
    sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"])

    grouped_instances = [
        [self.instances[idx] for idx in indices]
        for indices in sampler.get_batch_indices(self.instances)
    ]

    # Groups are crossed off as they are matched, so the list must be
    # empty once every produced batch has been seen.
    unmatched_groups = [
        [self.instances[4], self.instances[2]],
        [self.instances[0], self.instances[1]],
        [self.instances[3]],
    ]
    for group in grouped_instances:
        assert group in unmatched_groups
        unmatched_groups.remove(group)
    assert unmatched_groups == []
def val_dataloader(self):
    """Return the validation DataLoader, bucketed by source and target token fields.

    Batch size and worker count are read from `self.hparams`; the sampler
    is created with `drop_last=True`.
    """
    hparams = self.hparams
    bucket_sampler = BucketBatchSampler(
        self.val_dataset,
        batch_size=hparams['batch_size'],
        sorting_keys=['source_tokens', 'target_tokens'],
        drop_last=True,
    )
    return DataLoader(
        self.val_dataset,
        batch_sampler=bucket_sampler,
        num_workers=hparams['num_workers'],
    )
def test_batch_count(self):
    """The loader built on the bucket sampler should report three batches for batch size 2."""
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(
        dataset,
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
    )

    def as_batch(instances):
        # Testing collate function: it does not create tensors, it only
        # wraps the instances in an allennlp Batch.
        return Batch(instances)

    dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=as_batch)

    assert len(dataloader) == 3
def test_batch_count(self):
    """The multi-process loader should report three batches for batch size 2."""
    bucket_sampler = BucketBatchSampler(
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
    )
    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(),
        "fake_path",
        batch_sampler=bucket_sampler,
    )
    data_loader.index_with(self.vocab)

    expected_batches = 3
    assert len(data_loader) == expected_batches
def main():
    """Train an LSTM sentence classifier on the Stanford Sentiment Treebank and print one prediction."""
    reader = StanfordSentimentTreeBankDatasetReader()

    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    # One bucket sampler instance is handed to both loaders.
    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    all_instances = chain(train_data_loader.iter_instances(),
                          dev_data_loader.iter_instances())
    vocab = Vocabulary.from_instances(all_instances, min_count={'tokens': 3})

    for loader in (train_data_loader, dev_data_loader):
        loader.index_with(vocab)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    lstm = torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20,
        cuda_device=-1,
    )
    trainer.train()

    # Classify one sentence and print the label with the highest logit.
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    prediction = predictor.predict('This is the best movie ever!')
    label_id = np.argmax(prediction['logits'])
    print(model.vocab.get_token_from_index(label_id, 'labels'))
def test_batch_count_with_drop_last(self):
    """With `drop_last=True` the loader should report two batches for batch size 2."""
    bucket_sampler = BucketBatchSampler(
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )
    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(),
        "fake_path",
        batch_sampler=bucket_sampler,
    )

    expected_batches = 2
    assert len(data_loader) == expected_batches
def test_create_batches_groups_correctly(self):
    """Iterating the sampler must yield exactly the expected batches, in order."""
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(
        dataset,
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
    )

    # Map every batch of indices back to the instances it refers to.
    grouped_instances = [
        [self.instances[idx] for idx in indices]
        for indices in sampler
    ]

    expected_groups = [
        [self.instances[4], self.instances[2]],
        [self.instances[0], self.instances[1]],
        [self.instances[3]],
    ]
    assert grouped_instances == expected_groups
def test_from_params(self):
    """Build samplers through `from_params` and check default and explicit settings."""
    sorting_keys = ["s1", "s2"]

    # Only the sorting keys and the batch size are configured here; the
    # test expects `padding_noise` to come back as 0.1.
    minimal_params = Params({})
    minimal_params["sorting_keys"] = sorting_keys
    minimal_params["batch_size"] = 32
    sampler = BucketBatchSampler.from_params(params=minimal_params)

    assert sampler.sorting_keys == sorting_keys
    assert sampler.padding_noise == 0.1
    assert sampler.batch_size == 32

    # Every option supplied explicitly.
    full_config = {
        "sorting_keys": sorting_keys,
        "padding_noise": 0.5,
        "batch_size": 100,
        "drop_last": True,
    }
    sampler = BucketBatchSampler.from_params(params=Params(full_config))

    assert sampler.sorting_keys == sorting_keys
    assert sampler.padding_noise == 0.5
    assert sampler.batch_size == 100
    assert sampler.drop_last
def test_drop_last_works(self):
    """With `drop_last=True` the trailing partial batch must be discarded."""
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(
        dataset,
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )

    def as_batch(instances):
        # Testing collate function: it does not create tensors, it only
        # wraps the instances in an allennlp Batch.
        return Batch(instances)

    dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=as_batch)
    stats = self.get_batches_stats(list(dataloader))

    # all batches have length batch_size
    assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

    # we should have lost one instance by skipping the last batch
    expected_total = len(self.instances) - 1
    assert stats["total_instances"] == expected_total
def get_accuracy(model, dev_dataset, vocab, trigger_token_ids=None, snli=False):
    """
    Evaluate the model over `dev_dataset` and print its accuracy.

    When trigger_token_ids is None the result is reported as the accuracy
    without triggers. Otherwise the trigger tokens are looked up in `vocab`
    and printed next to the accuracy obtained with those ids passed to
    `evaluate_batch` for the whole dev_dataset.
    """
    model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case

    bucket_sampler = BucketBatchSampler(dev_dataset, batch_size=128)
    data_loader = DataLoader(dev_dataset, batch_sampler=bucket_sampler)

    if trigger_token_ids is None:
        report_prefix = "Without Triggers: "
    else:
        # Every token is followed by ', ', including the last one.
        trigger_text = "".join(
            vocab.get_token_from_index(idx) + ', ' for idx in trigger_token_ids)
        report_prefix = "Current Triggers: " + trigger_text + " : "

    for batch in data_loader:
        evaluate_batch(model, batch, trigger_token_ids, snli)

    print(report_prefix + str(model.get_metrics()['accuracy']))
def run_training_loop(bert_model=None):
    """Read the TSV data, train a classifier, then log predictions for two sample sentences.

    `bert_model` is forwarded to `build_data_reader` and `build_model`; when
    it is not None the trainer is built with `hugging_optim=True`.
    """
    # BUILDING DATA READER
    logging.info("Building data reader...")
    dataset_reader = build_data_reader(bert_model)

    logging.info("Reading data...")
    # These are a subclass of pytorch Datasets, with some allennlp-specific functionality added.
    train_instances = dataset_reader.read("/allennlp/data/train.tsv")
    logging.info(f"got {len(train_instances)} train instances")
    valid_instances = dataset_reader.read("/allennlp/data/dev.tsv")
    logging.info(f"got {len(valid_instances)} valid instances")

    logging.info("Building vocabulary...")
    # Keep every token that appears at least once.
    # NOTE(review): the original comment referred to namespace 'tokens' while
    # the key used here is 'text' - confirm which namespace the indexer uses.
    vocab = Vocabulary.from_instances(train_instances + valid_instances,
                                      min_count={'text': 1})
    logging.info(vocab)
    # for namespace in vocab.get_namespaces():
    #     logging.info(f"vocab[{namespace}] size: {vocab.get_vocab_size(namespace=namespace)}")
    # return

    logging.info("Building model...")
    model = build_model(vocab, bert_model)
    # Use the first GPU when one is available, otherwise stay on the CPU (-1).
    cuda_device = 0 if torch.cuda.is_available() else -1
    if cuda_device >= 0:
        model = model.cuda(cuda_device)
    logging.info(model)

    logging.info("Building data loaders...")
    # This is the allennlp-specific functionality in the Dataset object:
    # we need to be able convert strings in the data to integers.
    # this is how we do it.
    for instances in (train_instances, valid_instances):
        instances.index_with(vocab)

    # Using a BucketBatchSampler:
    # It sorts the instances by the length of their longest Field (or by any sorting keys you specify)
    # and automatically groups them so that instances of similar lengths get batched together.
    # Both samplers sort by the length of the instance field 'text'.
    train_batch_sampler = BucketBatchSampler(train_instances, batch_size=8, sorting_keys=['text'])
    valid_batch_sampler = BucketBatchSampler(valid_instances, batch_size=8, sorting_keys=['text'])

    # These are again a subclass of pytorch DataLoaders, with an allennlp-specific collate function,
    # that runs our indexing and batching code.
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is what actually does indexing and batching.
    # train_loader = DataLoader(train_instances, batch_size=8, shuffle=True)
    # valid_loader = DataLoader(valid_instances, batch_size=8, shuffle=False)
    train_loader = DataLoader(train_instances, batch_sampler=train_batch_sampler)  # , shuffle=True)
    valid_loader = DataLoader(valid_instances, batch_sampler=valid_batch_sampler)  # , shuffle=False)

    logging.info("Building trainer...")
    trainer = build_trainer(
        model,
        "/allennlp/models/tmp",
        train_loader,
        valid_loader,
        hugging_optim=bert_model is not None,
        cuda_device=cuda_device,
    )
    logging.info("Start training...")
    trainer.train()
    logging.info("done.")

    # Disabled test-set evaluation, kept for reference:
    # logging.info("====================")
    # logging.info("Loading test data...")
    # test_instances = dataset_reader.read("/allennlp/data/test.tsv")
    # test_instances.index_with(vocab)
    # test_loader = DataLoader(test_instances, batch_size=8, shuffle=False)
    # logging.info("Predicting on test set...")
    # # utility function to run your model and get the metric on the test set
    # results = evaluate(model, test_loader)
    # logging.info(results)

    # from: https://guide.allennlp.org/training-and-prediction#4
    logging.info("====================")
    logging.info("Constructing a Predictor for custom inputs...")
    predictor = SentenceClassifierPredictor(model, dataset_reader)

    sample_sentences = ['A good movie!', 'This was a monstrous waste of time.']
    for sent in sample_sentences:
        logging.info("")
        logging.info(f"Predicting for '{sent}'...")
        output = predictor.predict(sent)
        # output['probs'] comes from the model's forward() function and is just an
        # array of probabilities for class labels, so vocab.get_token_from_index()
        # is used to convert a label ID back to its label string.
        for label_id, prob in enumerate(output['probs']):
            logging.info(f"{vocab.get_token_from_index(label_id, 'labels')}: {prob}")
    logging.info("done.")
def train(self):
    """Run adversarial training for the configured task and architecture.

    Hyper-parameters are read through `read_hyper`, the adversarial policy
    is selected by `self.config.adv_policy`, and the `AdvTrainer` serializes
    to `saved/models/<model_name>`.
    """
    config = self.config

    if config.adjust_point:
        ram_set_flag("adjust_point")
    # ram_write('dist_reg', self.config.dist_reg)

    read_hyper_ = partial(read_hyper, config.task_id, config.arch)
    num_epochs = int(read_hyper_("num_epochs"))
    batch_size = int(read_hyper_("batch_size"))
    logger.info(f"num_epochs: {num_epochs}, batch_size: {batch_size}")

    # The scratch model name 'tmp' starts from a clean serialization directory.
    if config.model_name == 'tmp':
        tmp_dir = pathlib.Path('saved/models/tmp')
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)

    # Maybe we will do some data augmentation here.
    if config.aug_data != '':
        log(f'Augment data from {config.aug_data}')
        aug_data = auto_create(
            f"{config.task_id}.{config.arch}.aug",
            lambda: self.reader.read(config.aug_data),
            cache=True,
        )
        self.train_data.instances.extend(aug_data.instances)

    # Set up the adversarial training policy
    model_vocab = embed_util.get_bert_vocab() if config.arch == 'bert' else self.vocab

    # Non-BERT sentence-pair tasks attack the second sentence field.
    pair_without_bert = is_sentence_pair(config.task_id) and config.arch != 'bert'
    adv_field = 'sent2' if pair_without_bert else 'sent'

    # yapf: disable
    neighbour_file = "external_data/euc-top8.json" if config.big_nbrs else "external_data/ibp-nbrs.json"
    word_searcher = CachedWordSearcher(
        neighbour_file,
        model_vocab.get_token_to_index_vocabulary("tokens"),
        second_order=False
    )
    policy_args = {
        "adv_iteration": config.adv_iter,
        "replace_num": config.adv_replace_num,
        "searcher": WordIndexSearcher(
            word_searcher,
            word2idx=model_vocab.get_token_index,
            idx2word=model_vocab.get_token_from_index,
        ),
        'adv_field': adv_field
    }
    # yapf: enable

    policy_name = config.adv_policy
    if policy_name == 'hot':
        if pair_without_bert:
            policy_args['forward_order'] = 1
        adv_policy = adv_utils.HotFlipPolicy(**policy_args)
    elif policy_name == 'rdm':
        adv_policy = adv_utils.RandomNeighbourPolicy(**policy_args)
    elif policy_name == 'diy':
        adv_policy = adv_utils.DoItYourselfPolicy(config.adv_iter, adv_field, config.adv_step)
    else:
        adv_policy = adv_utils.NoPolicy

    # A collate_fn transforms an instance before it is fed into the model.
    # To train with transformations such as cropping/DAE, modify the code
    # here, e.g.
    # collate_fn = partial(transform_collate, self.vocab, self.reader, Crop(0.3))
    collate_fn = allennlp_collate
    train_data_sampler = BucketBatchSampler(
        data_source=self.train_data,
        batch_size=batch_size,
    )

    # Set callbacks
    epoch_callbacks = []
    if config.task_id == 'SNLI' and config.arch != 'bert' and config.model_pretrain != "":
        epoch_callbacks = [WarmupCallback(2)]
        if config.model_pretrain == 'auto':
            # Resolve 'auto' to the pretrained model matching the architecture.
            config.model_pretrain = {
                "biboe": "SNLI-fix-biboe-sum",
                "datt": "SNLI-fix-datt"
            }[config.arch]
        logger.warning(
            f"Try loading weights from pretrained model {config.model_pretrain}"
        )
        pretrain_ckpter = CheckpointerX(f"saved/models/{config.model_pretrain}")
        self.model.load_state_dict(pretrain_ckpter.best_model_state())
    # epoch_callbacks = []
    batch_callbacks = []

    opt = self.model.get_optimizer()
    # A learning-rate schedule is only used for BERT.
    if config.arch == 'bert':
        scl = SlantedTriangular(opt, num_epochs, len(self.train_data) // batch_size)
    else:
        scl = None

    train_loader = DataLoader(
        self.train_data,
        batch_sampler=train_data_sampler,
        collate_fn=collate_fn,
    )
    dev_loader = DataLoader(
        self.dev_data,
        batch_size=batch_size,
    )

    trainer = AdvTrainer(
        model=self.model,
        optimizer=opt,
        learning_rate_scheduler=scl,
        validation_metric='+accuracy',
        adv_policy=adv_policy,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        patience=None,
        grad_clipping=1.,
        cuda_device=0,
        epoch_callbacks=epoch_callbacks,
        batch_callbacks=batch_callbacks,
        serialization_dir=f'saved/models/{config.model_name}',
        num_serialized_models_to_keep=20,
    )
    trainer.train()
def main():
    """Train an LSTM sentence classifier on local Treebank files and classify a sample article."""
    reader = StanfordSentimentTreeBankDatasetReader()

    s3_prefix = 'https://s3.amazonaws.com/realworldnlpbook/data'
    # train_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/train.txt')
    # dev_dataset = reader.read(f'{s3_prefix}/stanfordSentimentTreebank/trees/dev.txt')
    train_dataset = reader.read('Treebank_train.txt')
    print(type(train_dataset))
    print(train_dataset)
    dev_dataset = reader.read('Treebank_dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    lstm = torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    model = LstmClassifier(word_embeddings, encoder, vocab)

    train_dataset.index_with(vocab)
    dev_dataset.index_with(vocab)

    # Both loaders bucket their instances by the "tokens" field.
    train_sampler = BucketBatchSampler(train_dataset, batch_size=32, sorting_keys=["tokens"])
    train_data_loader = DataLoader(train_dataset, batch_sampler=train_sampler)
    dev_sampler = BucketBatchSampler(dev_dataset, batch_size=32, sorting_keys=["tokens"])
    dev_data_loader = DataLoader(dev_dataset, batch_sampler=dev_sampler)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20,
    )
    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    # logits = predictor.predict('This is the best movie ever!')['logits']
    article = '''On August 28, Mustafa varank, Turkey's minister of industry and technology, said Turkey plans to become a production center for automotive batteries by investing in cells, battery modules and battery packs. The country also hopes to become Europe's largest and the world's top five electric and autopilot auto makers by 2030. In order to achieve this goal, varank said Turkey would support the investment of electronic and electrical companies in the automotive industry. Varank points out that modern Turkish plants will cover half of the world's I20 capacity, 90% of which is expected to be exported abroad. "It took 27 months to build this line, with a total investment of $194 million. The productivity of I20 in Turkey will exceed 60%, which will increase gradually. In the past year, Turkey has developed EMUs, SUVs, tractors and excavators equipped with electric engines, and now plans to develop electric vehicle technology. 
Varank said Turkey would build an ecosystem to produce key components for electric vehicles, such as electric engines, inverters, charging equipment and compressors. He stressed that the automobile industry is the "locomotive" of Turkey's industrial sector, which also provides advantages for other industries. In May and June this year, Turkey's industrial production increased by double-digit compared with the same period last year. In the first half of 2020, Turkey issued 1200 investment award certificates worth US $108 billion (about US $16.7 billion) and created 163000 new jobs. On August 28, Turkey released its economic confidence index for August, and varank said: "the positive trend continues, and our citizens have more positive expectations for the post epidemic period." Choi Hong GHI, South Korea's ambassador to Ankara, said that Hyundai Motor, one of the world's top five auto manufacturers, established its first overseas factory in Turkey 23 years ago. "Hyundai's zmit factory is a symbol of economic cooperation between the two countries, which directly promotes employment and exports in Turkey." Eckkyun Oh, chief executive of Hyundai assan, said the company has produced more than two million cars in Turkey, most of which are exported to countries in Europe, the Middle East and North Africa. "We will produce 100000 new I20 cars here," he said.'''
    logits = predictor.predict(article)['logits']

    # Print the label with the highest logit.
    label_id = np.argmax(logits)
    print(model.vocab.get_token_from_index(label_id, 'labels'))
from resolution.common.data.reader.bert_pointer_rewrite_reader import BertPointerRewriteReader

# Script: read a small rewrite sample with BertPointerRewriteReader and
# print the first two bucketed batches.

# Location of the pretrained model and the sample data below `basename`.
basename = "/home/zs261988/"
data_path = "data/"
model_path = "models/ptms/"
model_name = "albert_void_tiny/"
vocab_file = "vocab.txt"
sample_file = "rewrite/sample_100.txt"

model_dir = basename + model_path + model_name
reader = BertPointerRewriteReader(
    model_name=model_dir,
    vocab_file=model_dir + vocab_file,
)

# Read the data
train_data = reader.read(Path(basename) / data_path / sample_file)

# Take the Vocabulary from the reader and attach it to the data set
vocab = reader.vocab
train_data.vocab = vocab

# Show the indices of the special tokens in the "bert_tags" namespace.
print("[PAD]: ", vocab.get_token_index("[PAD]", namespace="bert_tags"))
print("[CLS]: ", vocab.get_token_index("[CLS]", namespace="bert_tags"))
print("[SEP]: ", vocab.get_token_index("[SEP]", namespace="bert_tags"))

datasampler = BucketBatchSampler(train_data, batch_size=16)
dataloader = DataLoader(dataset=train_data, batch_sampler=datasampler)

# Only the first two batches are printed.
for i, batch in enumerate(dataloader):
    print(batch)
    if i >= 1:
        break
def main():
    """Train (or load) a binary SST LSTM classifier and search for a universal trigger.

    The classifier is loaded from /tmp when saved weights already exist,
    otherwise it is trained and saved. Dev instances labelled "0" are then
    used to update a three-token trigger with the HotFlip attack, printing
    the accuracy before, during and after the search.

    Raises:
        ValueError: if EMBEDDING_TYPE is neither "None" nor "w2v".
    """
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300
    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300
    else:
        # Without this branch an unknown setting surfaced later as an
        # UnboundLocalError on `token_embedding`.
        raise ValueError(
            f"Unsupported EMBEDDING_TYPE {EMBEDDING_TYPE!r}; expected \"None\" or \"w2v\"")

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        train_sampler = BucketBatchSampler(train_data, batch_size=32, sorting_keys=["tokens"])
        dev_sampler = BucketBatchSampler(dev_data, batch_size=32, sorting_keys=["tokens"])
        train_loader = DataLoader(train_data, batch_sampler=train_sampler)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler)
        optimizer = optim.Adam(model.parameters())
        trainer = GradientDescentTrainer(model=model,
                                         optimizer=optimizer,
                                         data_loader=train_loader,
                                         validation_data_loader=dev_loader,
                                         num_epochs=5,
                                         patience=1,
                                         cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = [
        instance for instance in dev_data
        if instance['label'].label == dataset_label_filter
    ]
    targeted_dev_data = AllennlpDataset(targeted_dev_data, vocab)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    targeted_sampler = BasicBatchSampler(sampler=SequentialSampler(targeted_dev_data),
                                         batch_size=universal_perturb_batch_size,
                                         drop_last=False)  # TODO don't drop last
    targeted_loader = DataLoader(targeted_dev_data, batch_sampler=targeted_sampler)

    # sample batches, update the triggers, and repeat
    for epoch in range(5):
        for batch in targeted_loader:
            # get accuracy with current triggers
            utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
            model.train()  # rnn cannot do backwards in eval mode

            # get gradient w.r.t. trigger embeddings for current batch
            averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

            # pass the gradients to a particular attack to generate token candidates for each token.
            cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                            embedding_weight,
                                                            trigger_token_ids,
                                                            num_candidates=40,
                                                            increase_loss=True)
            # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
            #                                                trigger_token_ids,
            #                                                num_candidates=40)
            # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
            #                                                        embedding_weight,
            #                                                        trigger_token_ids,
            #                                                        tree,
            #                                                        100,
            #                                                        num_candidates=40,
            #                                                        increase_loss=True)

            # Tries all of the candidates and returns the trigger sequence with highest loss.
            trigger_token_ids = utils.get_best_candidates(model,
                                                          batch,
                                                          trigger_token_ids,
                                                          cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
def main():
    """Search for a universal trigger against a BERT SNLI model.

    Loads the SNLI dev set and a trained model archive, keeps only the
    instances whose label equals `dataset_label_filter`, and repeatedly
    updates the trigger token ids with the HotFlip attack so predictions
    move towards `target_label`.

    Raises:
        ValueError: if `model_type` is neither "merged" nor "pred".
    """
    # Load SNLI dataset
    bert_indexer = PretrainedTransformerIndexer('bert-base-uncased')
    tokenizer = PretrainedTransformerTokenizer(model_name='bert-base-uncased')
    reader = SnliReader(token_indexers={'tokens': bert_indexer},
                        tokenizer=tokenizer,
                        combine_input_fields=True)

    # single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
    # tokenizer = WordTokenizer(end_tokens=["@@NULL@@"]) # add @@NULL@@ to the end of sentences
    # reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)
    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl'
    )

    # Load model and vocab
    model_type = "pred"
    # model_type = "merged"
    if model_type == "merged":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/merged_model.tar.gz'
        ).model
    elif model_type == "pred":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/bert_trained2.tar.gz'
        ).model
    else:
        # Fail early instead of hitting an unbound `model` below.
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'merged' or 'pred'")
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. to the input tokens
    utils.add_hooks(model)

    # save the word embedding matrix
    if model_type == "merged":
        embedding_weight = model.combined_model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight
    else:
        embedding_weight = model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight
    # print(model.combined_model._text_field_embedder._modules["token_embedder_tokens"].transformer_model.embeddings.word_embeddings)
    # print(embedding_weight.size())

    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32
    # iterator = DataIterator(batch_size=universal_perturb_batch_size)
    # iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = 'entailment'  # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = [
        instance for instance in dev_dataset
        if instance['label'].label == dataset_label_filter
    ]
    print(len(subset_dev_dataset))
    print(len(dev_dataset))

    # the attack is targeted towards a specific class
    # target_label = "0" # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    # tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(model,
                       subset_dev_dataset,
                       vocab,
                       tokenizer,
                       model_type,
                       trigger_token_ids=None,
                       snli=True)
    model.train()  # backward passes are not possible in eval mode

    # Initialize triggers
    num_trigger_tokens = 2  # number of trigger tokens
    start_tok = tokenizer.tokenizer.encode("a")[1]
    print(start_tok)
    trigger_token_ids = [start_tok] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    # The attack batches are drawn from the label-filtered subset. The loader
    # used to wrap the full `dev_dataset`, which mixed every class into a
    # single-class attack.
    subset_dev_dataset_dataset = AllennlpDataset(subset_dev_dataset, vocab)
    train_sampler = BucketBatchSampler(subset_dev_dataset_dataset,
                                       batch_size=universal_perturb_batch_size,
                                       sorting_keys=["tokens"])
    train_dataloader = DataLoader(subset_dev_dataset_dataset, batch_sampler=train_sampler)
    # for batch in lazy_groups_of(iterators(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
    for batch in train_dataloader:
        # get model accuracy with current triggers
        utils.get_accuracy(model,
                           subset_dev_dataset,
                           vocab,
                           tokenizer,
                           model_type,
                           trigger_token_ids,
                           snli=True)
        model.train()  # backward passes are not possible in eval mode

        # get grad of triggers
        averaged_grad = utils.get_average_grad(model,
                                               batch,
                                               trigger_token_ids,
                                               target_label,
                                               snli=True)

        # find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        increase_loss=False,
                                                        num_candidates=40)
        print("------")
        print(cand_trigger_token_ids)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids,
                                                      snli=True)
# Script: load the BERT vocabulary and a fine-tuned span-pointer resolution
# model, run it over the validation data and print the metrics.

# Build the vocabulary
print("加载词表.........")
vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(
    bert_path + "vocab.txt",
    is_padded=False,
    oov_token="[UNK]",
    namespace="bert_tags",
)

# Build the reader and the model
print("定义模型........")
reader = BertSpanResolutionReader(
    model_name=bert_path,
    max_turn_len=max_turn_len,
    max_length=max_length,
)
model = BertSpanPointerResolution(
    vocab=vocab,
    model_name=bert_path,
    max_turn_len=max_turn_len,
    task_pretrained_file=Path(pretrained_file) / "best.th",
)
model = model.eval()

# Read the test-set data
instances = reader.read(validation_data_path)
instances.vocab = vocab

datasampler = BucketBatchSampler(instances, batch_size=16)
dataloader = DataLoader(dataset=instances, batch_sampler=datasampler)

print("预测.........")
# Read the data and run the forward pass; gradients are not needed here.
# The outputs themselves are unused, only the model's metrics are printed.
with torch.no_grad():
    for i, batch in tqdm(enumerate(dataloader)):
        output_dict = model(**batch)

print("所有指标:", model.get_metrics())