def test_forward_runs_with_non_bijective_mapping(self):
    """Smoke-test ``forward`` when one embedder is fed by several indexers.

    The ``embedder_to_indexer_map`` routes both the ``"elmo"`` and the
    ``"words"`` indexer outputs to the ``"elmo"`` embedder. The test makes
    no assertion on the output; it passes as long as the call does not raise.
    """
    fixtures_dir = self.FIXTURES_ROOT / 'elmo'
    options_file = str(fixtures_dir / 'options.json')
    weight_file = str(fixtures_dir / 'lm_weights.hdf5')

    word_embedder_config = {
        "type": "embedding",
        "num_embeddings": 20,
        "embedding_dim": 2,
    }
    elmo_embedder_config = {
        "type": "elmo_token_embedder",
        "options_file": options_file,
        "weight_file": weight_file,
    }
    indexer_map = {"words": ["words"], "elmo": ["elmo", "words"]}
    params = Params({
        "token_embedders": {
            "words": word_embedder_config,
            "elmo": elmo_embedder_config,
        },
        "embedder_to_indexer_map": indexer_map,
    })
    embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)

    # Random ids: "words" is drawn first, then "elmo", as in the original.
    word_ids = (torch.rand(3, 6) * 20).long()
    elmo_ids = (torch.rand(3, 6, 50) * 15).long()
    embedder({'words': word_ids, 'elmo': elmo_ids})
def test_forward_works_on_higher_order_input(self):
    """Check the output size when the inputs carry two extra wrapping dims."""
    word_config = {
        "type": "embedding",
        "num_embeddings": 20,
        "embedding_dim": 2,
    }
    character_config = {
        "type": "character_encoding",
        "embedding": {
            "embedding_dim": 4,
            "num_embeddings": 15,
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 4,
            "num_filters": 10,
            "ngram_filter_sizes": [3],
        },
    }
    params = Params({"words": word_config, "characters": character_config})
    embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)

    word_ids = Variable(torch.rand(3, 4, 5, 6) * 20).long()
    character_ids = Variable(torch.rand(3, 4, 5, 6, 7) * 15).long()
    embedded = embedder({'words': word_ids, 'characters': character_ids},
                        num_wrapping_dims=2)

    # Last dimension is 12; presumably 2 (word embedding) + 10 (CNN filters).
    assert embedded.size() == (3, 4, 5, 6, 12)
def setUp(self):
    """Create a four-token vocabulary, a three-way embedder and its inputs."""
    super(TestBasicTextFieldEmbedder, self).setUp()

    self.vocab = Vocabulary()
    for token in ("1", "2", "3", "4"):
        self.vocab.add_token_to_namespace(token)

    # One plain embedding per key; only the embedding size differs.
    embedding_dims = (("words1", 2), ("words2", 5), ("words3", 3))
    params = Params({
        key: {"type": "embedding", "embedding_dim": dim}
        for key, dim in embedding_dims
    })
    self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)

    token_ids = (
        ("words1", [[0, 2, 3, 5]]),
        ("words2", [[1, 4, 3, 2]]),
        ("words3", [[1, 5, 1, 2]]),
    )
    self.inputs = {
        key: Variable(torch.LongTensor(ids)) for key, ids in token_ids
    }
def test_old_from_params_new_from_params(self):
    """The flat (old) and ``token_embedders`` (new) formats build the same embedders."""

    def embedder_specs():
        # A fresh dict on every call: Params may consume the dict it wraps,
        # so the two configurations must not share nested objects.
        return {
            "words1": {"type": "embedding", "embedding_dim": 2},
            "words2": {"type": "embedding", "embedding_dim": 5},
            "words3": {"type": "embedding", "embedding_dim": 3},
        }

    # Allow loading the parameters in the old format
    old_params = Params(embedder_specs())
    with pytest.warns(DeprecationWarning):
        old_embedder = BasicTextFieldEmbedder.from_params(params=old_params,
                                                          vocab=self.vocab)

    # But also allow loading the parameters in the new format
    new_params = Params({"token_embedders": embedder_specs()})
    new_embedder = BasicTextFieldEmbedder.from_params(params=new_params,
                                                      vocab=self.vocab)

    old_keys = old_embedder._token_embedders.keys()  # pylint: disable=protected-access
    new_keys = new_embedder._token_embedders.keys()  # pylint: disable=protected-access
    assert old_keys == new_keys

    # Output size is (1, 4, 10); 10 matches the summed embedding dims 2 + 5 + 3.
    assert new_embedder(self.inputs).size() == (1, 4, 10)
def test_old_from_params_new_from_params(self):
    """The old format must warn; the new format must build a working embedder."""

    def embedder_specs():
        # Built anew for each Params object so neither sees a consumed dict.
        return {
            "words1": {"type": "embedding", "embedding_dim": 2},
            "words2": {"type": "embedding", "embedding_dim": 5},
            "words3": {"type": "embedding", "embedding_dim": 3},
        }

    # Old (flat) format: only the DeprecationWarning is checked here.
    old_params = Params(embedder_specs())
    with pytest.warns(DeprecationWarning):
        BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

    # New format: specs nested under "token_embedders".
    new_params = Params({"token_embedders": embedder_specs()})
    token_embedder = BasicTextFieldEmbedder.from_params(params=new_params,
                                                        vocab=self.vocab)
    output_size = token_embedder(self.inputs).size()
    assert output_size == (1, 4, 10)
def main():
    """Train a BiLSTM multi-label classifier on the Jigsaw CSV files.

    Reads ``train.csv`` and ``test_proced.csv`` from ``../../data/jigsaw``,
    builds the vocabulary from both, and runs the AllenNLP ``Trainer``.
    """
    indexers = {"tokens": SingleIdTokenIndexer()}
    reader = JigsawDatasetReader(
        tokenizer=custom_tokenizer(),
        token_indexers=indexers,
    )

    # Kaggle multi-label "Toxic Comment Classification Challenge" data.
    data_dir = Path('../../data/jigsaw')
    train_dataset = reader.read(data_dir / "train.csv")
    dev_dataset = reader.read(data_dir / "test_proced.csv")
    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Build the vocabulary from both datasets.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Network: embedding -> bidirectional LSTM -> linear classifier.
    embedding_dim = 300
    hidden_dim = 128
    token_embedding = Embedding(num_embeddings=vocab_dim,
                                embedding_dim=embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = torch.nn.LSTM(embedding_dim,
                         hidden_dim,
                         bidirectional=True,
                         batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)
    model = MultiLabelClassifier(word_embeddings, 0.5, encoder, 0.2,
                                 len(label_cols), vocab)

    # Original author's note: AllenNLP did not seem to support (or perform
    # well with) single-machine multi-GPU training, so one device is used.
    if torch.cuda.is_available():
        gpu_id = 0
    else:
        gpu_id = -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Build the iterator and tell it which vocabulary to index with.
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # A commented-out forward-pass demo used to sit here. It pulled five
    # batches from the iterator, moved each to the device, and either ran
    # word_embeddings -> encoder -> linear step by step or called
    # model(**batch) followed by model.get_metrics().

    # --------------------- train ---------------------
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-4,
                                 weight_decay=1e-5)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        # serialization_dir="./models/",
        cuda_device=gpu_id,
        patience=10,
        num_epochs=20,
    )
    trainer.train()
def EnhancedRCNN_train():
    """Train ``EnhancedRCNNModel`` on SNLI with frozen GloVe vectors, then test it.

    Steps: build a vocabulary from the GloVe file, train with the AllenNLP
    ``Trainer`` on the SNLI train/dev splits, run the test split, print p@1
    and timings, and write gold/predicted labels to a CSV unless the target
    file already exists.

    The test loop works on CPU as well as GPU: the original indexed
    ``cuda_device[0]`` and called ``.cuda()`` unconditionally, which failed
    whenever CUDA was unavailable (``cuda_device`` is the int ``-1`` then).
    """
    print("enter train")
    with open(model_config.glove_file_path) as fp:
        text = fp.readlines()

    # TODO: find a more elegant way to seed the initial counter.
    # Each GloVe word gets a count that decreases with its line index, so
    # every word has a count >= 3 and passes the min_count filter below.
    glove_lines = len(text)
    token_counts = {
        "tokens": {
            line.split(' ')[0]: glove_lines - idx + 2
            for idx, line in enumerate(text)
        }
    }
    # print(list(token_counts.items())[:10])
    vocab = Vocabulary(counter=token_counts,
                       min_count={"tokens": 1},
                       # non_padded_namespaces=['tokens'],
                       pretrained_files={'tokens': model_config.glove_file_path},
                       only_include_pretrained_words=True)

    EMBEDDING_DIM = 300
    token_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({
            'trainable': False,
            'pretrained_file': model_config.glove_file_path,
            'embedding_dim': EMBEDDING_DIM,
            'vocab_namespace': "tokens"})
    )
    print("GloVe loaded")

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    model = EnhancedRCNNModel(word_embeddings, model_config.num_class, vocab=vocab)

    # `cuda_device` (list of all GPUs, or -1) is what the Trainer receives;
    # `primary_device` is the single index used for manual batch placement.
    if torch.cuda.is_available():
        cuda_device = list(range(torch.cuda.device_count()))
        primary_device = cuda_device[0]
        model = model.cuda(primary_device)
    else:
        cuda_device = -1
        primary_device = -1
    print("cuda device : {}".format(cuda_device))

    reader = ListWiseDatasetReader(vocab=vocab)
    train_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_train.jsonl"))
    dev_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_dev.jsonl"))
    test_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_test.jsonl"))

    # fc_lr = 1e-3
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=model_config.learning_rate,
                                momentum=0.9)
    # Alternative kept from the original (per-layer learning rates):
    # optimizer = torch.optim.SGD([{'params': model.embedder.parameters()},
    #                              {'params': model.fc1.parameters(), 'lr': fc_lr},
    #                              {'params': model.fc2.parameters(), 'lr': fc_lr},
    #                              {'params': model.proj_1.parameters(), 'lr': fc_lr},
    #                              {'params': model.proj_2.parameters(), 'lr': fc_lr},
    #                              {'params': model.bert_prediction.parameters(), 'lr': fc_lr},
    #                              ], lr=model_config.learning_rate, momentum=0.9)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    sorting_keys = [("left_input_tokens_field", "num_tokens"),
                    ("right_input_tokens_field", "num_tokens")]
    iterator_train = BucketIterator(batch_size=model_config.batch_size,
                                    sorting_keys=sorting_keys)
    iterator_train.index_with(vocab)

    model.train()
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator_train,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=model_config.patience,
                      num_epochs=model_config.epochs,
                      cuda_device=cuda_device,
                      shuffle=True)
    train_start_time = time.time()
    trainer.train()
    train_end_time = time.time()

    # test
    model.eval()
    preds = []
    gd = []
    gd_pos = []
    with torch.no_grad():
        iterator_test = BucketIterator(batch_size=model_config.batch_size,
                                       sorting_keys=sorting_keys)
        iterator_test.index_with(vocab)
        # Positional arguments kept from the original: (instances, 1, False).
        generator_test = iterator_test(test_dataset, 1, False)
        test_start_time = time.time()
        for batch in generator_test:
            batch = move_to_device(batch, primary_device)
            gd.extend(batch['label'].squeeze(-1).long().cpu().numpy().tolist())
            out_dict = model(batch['left_input_tokens_field'],
                             batch['right_input_tokens_field'],
                             batch['label'])
            logits = out_dict['logits']
            batch_pred = torch.argmax(logits, -1).cpu().numpy()
            preds.extend(batch_pred.tolist())

            # Position of the gold label in the descending ranking of logits.
            _, sorted_idx = torch.sort(logits, dim=-1, descending=True)
            label_mat = (batch['label']
                         .repeat(1, logits.shape[-1])
                         .long()
                         .to(sorted_idx.device))
            pos_mat = label_mat.eq(sorted_idx)
            pos_tensor = pos_mat.nonzero()[:, 1].cpu().numpy().tolist()
            gd_pos.extend(pos_tensor)
        test_end_time = time.time()

    print("p@1 : ", (np.sum(np.equal(gd, preds))) / len(gd))
    print("[train time] : {}".format(train_end_time - train_start_time))
    print("[test time] : {}".format(test_end_time - test_start_time))

    # Write the results only if the target file does not exist yet.
    save_path = os.path.join(root_path, model_config.save_path)
    if os.path.exists(save_path):
        print("save path already exists")
    else:
        results = pandas.DataFrame({'gd': gd, 'preds': preds})
        results.to_csv(save_path, index=False)
        print("save to path : {}".format(save_path))
def main():
    """Train an English-to-Chinese seq2seq model on Tatoeba and print samples.

    The trainer is configured for one epoch and invoked 50 times; after each
    run, predictions for the first ten validation instances are printed.
    """
    target_indexer = SingleIdTokenIndexer(namespace='target_tokens')
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': target_indexer})
    train_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/tatoeba/tatoeba.eng_cmn.dev.tsv')

    min_counts = {'tokens': 3, 'target_tokens': 3}
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count=min_counts)

    source_vocab_size = vocab.get_vocab_size('tokens')
    en_embedding = Embedding(num_embeddings=source_vocab_size,
                             embedding_dim=EN_EMBEDDING_DIM)
    # Alternative encoder kept from the original:
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # Alternative attention modules kept from the original:
    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)

    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for epoch in range(50):
        print('Epoch: {}'.format(epoch))
        trainer.train()

        # Qualitative check on the first ten validation instances.
        predictor = SimpleSeq2SeqPredictor(model, reader)
        for example in itertools.islice(validation_dataset, 10):
            fields = example.fields
            print('SOURCE:', fields['source_tokens'].tokens)
            print('GOLD:', fields['target_tokens'].tokens)
            prediction = predictor.predict_instance(example)
            print('PRED:', prediction['predicted_tokens'])
def train_main():
    """Train an ELMo + BiLSTM classifier on the Stanford Sentiment Treebank.

    Reads the SST train/test tree files, builds a vocabulary from both,
    embeds tokens with a fine-tunable ELMo embedder, and trains with the
    AllenNLP ``Trainer``.

    The original built and indexed an identical ``BucketIterator`` twice;
    it is now created once.
    """
    # Kept as in the original although the value is not read below: Config
    # is defined elsewhere, so side effects of constructing it are unknown.
    config = Config(
        testing=True,
        seed=1,
        batch_size=64,
        lr=3e-4,
        epochs=2,
        hidden_sz=64,
        max_seq_len=100,  # necessary to limit memory usage
        max_vocab_size=100000,
    )

    token_indexer = ELMoTokenCharactersIndexer()

    # A commented-out Jigsaw variant used to sit here. Its target labels were
    # toxic, severe_toxic, obscene, threat, insult and identity_hate, read
    # with JigsawDatasetReader from train.csv / test_proced.csv.

    # Stanford sentiment classification (SST-5) dataset.
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': token_indexer})
    train_dataset = reader.read('~/nlp/dataset/sst/trees/train.txt')
    # NOTE(review): the "dev" split is read from test.txt — confirm intended.
    dev_dataset = reader.read('~/nlp/dataset/sst/trees/test.txt')
    print(f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}")

    # Build the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

    # Pretrained ELMo LM, converted from bilm-tf with dump_weights in
    # bin/training.py.
    options_file = '../models/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    weight_file = '../models/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

    token_embedding = ElmoTokenEmbedder(options_file, weight_file,
                                        requires_grad=True,
                                        # do_layer_norm=True
                                        )

    # Pass in the ElmoTokenEmbedder instance instead.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    elmo_embedding_dim = word_embeddings.get_output_dim()
    hidden_dim = 256
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, hidden_dim,
                      bidirectional=True, batch_first=True))

    model = SSTClassifier(word_embeddings,
                          encoder,
                          out_dim=vocab.get_vocab_size("labels"),
                          vocab=vocab)
    gpu_id = 0 if torch.cuda.is_available() else -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Build the iterator once and tell it which vocabulary to index with.
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # A commented-out forward-pass demo used to sit here. It pulled five
    # batches, moved them to the device, ran word_embeddings -> encoder ->
    # linear by hand or via model(**batch), and printed the named parameters.

    # --------- train ------------
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      # serialization_dir="./models/",
                      cuda_device=gpu_id,
                      patience=10,
                      num_epochs=20)
    trainer.train()
# embeddings with respect to the vocabulary size of each of the relevant namespaces # in the vocabulary. word_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_ids"), embedding_dim=10) char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"), embedding_dim=5) character_cnn = CnnEncoder(embedding_dim=5, num_filters=2, output_dim=8) # This is going to embed an integer character tensor of shape: (batch_size, max_sentence_length, max_word_length) into # a 4D tensor with an additional embedding dimension, representing the vector for each character. # and then apply the character_cnn we defined above over the word dimension, resulting in a tensor # of shape: (batch_size, max_sentence_length, num_filters * ngram_filter_sizes). token_character_encoder = TokenCharactersEncoder(embedding=char_embedding, encoder=character_cnn) # Notice that these keys have the same keys as the TokenIndexers when we created our TextField. # This is how the text_field_embedder knows which function to apply to which array. # There should be a 1-1 mapping between TokenIndexers and TokenEmbedders in your model. text_field_embedder = BasicTextFieldEmbedder({"tokens": word_embedding, "characters": token_character_encoder}) # Convert the indexed dataset into Pytorch Variables. batch = Batch(instances) tensors = batch.as_tensor_dict(batch.get_padding_lengths()) print("Torch tensors for passing to a model: \n\n", tensors) print("\n\n") # tensors is a nested dictionary, first keyed by the # name we gave our instances (in most cases you'd have more # than one field in an instance) and then by the key of each # token indexer we passed to TextField. # This will contain two tensors: one from representing each # word as an index and one representing each _character_ # in each word as an index. text_field_variables = tensors["sentence"]
def build_model(args, vocab, pretrained_embs, tasks):
    '''Build the multi-task model according to the command-line arguments.

    args:
        - args: object whose attributes are read here: d_word,
          n_layers_highway, glove, train_words, elmo, deep_elmo,
          elmo_no_glove, d_hid, pair_enc, cove, n_layers_enc, dropout, cuda
        - vocab: vocabulary; queried for the size of the 'tokens' namespace
          and the index of '@@PADDING@@'
        - pretrained_embs: word-embedding weights, used only when args.glove
          is truthy
        - tasks: passed through to build_classifiers

    returns:
        the MultiTaskModel, moved to CUDA when args.cuda >= 0

    NOTE(review): the branch nesting below is reconstructed from a
    whitespace-collapsed source — confirm against the real file.
    '''
    d_word, n_layers_highway = args.d_word, args.n_layers_highway

    # Build embedding layers
    if args.glove:
        word_embs = pretrained_embs
        train_embs = bool(args.train_words)
    else:
        log.info("\tLearning embeddings from scratch!")
        word_embs = None
        train_embs = True
    word_embedder = Embedding(
        vocab.get_vocab_size('tokens'),
        d_word,
        weight=word_embs,
        trainable=train_embs,
        padding_index=vocab.get_token_index('@@PADDING@@'))
    # Running total of the phrase encoder's input size.
    d_inp_phrase = 0

    # Handle elmo and cove
    token_embedder = {}
    if args.elmo:
        log.info("\tUsing ELMo embeddings!")
        if args.deep_elmo:
            n_reps = 2
            log.info("\tUsing deep ELMo embeddings!")
        else:
            n_reps = 1
        if args.elmo_no_glove:
            # token_embedder stays empty, so text_field_embedder is None below.
            log.info("\tNOT using GLoVe embeddings!")
        else:
            token_embedder = {"words": word_embedder}
            log.info("\tUsing GLoVe embeddings!")
            d_inp_phrase += d_word
        elmo = Elmo(options_file=ELMO_OPT_PATH,
                    weight_file=ELMO_WEIGHTS_PATH,
                    num_output_representations=n_reps)
        d_inp_phrase += 1024
    else:
        elmo = None
        token_embedder = {"words": word_embedder}
        d_inp_phrase += d_word
    text_field_embedder = BasicTextFieldEmbedder(token_embedder) if "words" in token_embedder \
        else None
    # For 'bow' the hidden size is d_inp_phrase as it stands here, i.e.
    # before the CoVe increment below.
    d_hid_phrase = args.d_hid if args.pair_enc != 'bow' else d_inp_phrase

    if args.cove:
        cove_layer = cove_lstm(n_vocab=vocab.get_vocab_size('tokens'),
                               vectors=word_embedder.weight.data)
        d_inp_phrase += 600
        log.info("\tUsing CoVe embeddings!")
    else:
        cove_layer = None

    # Build encoders
    phrase_layer = s2s_e.by_name('lstm').from_params(
        Params({
            'input_size': d_inp_phrase,
            'hidden_size': d_hid_phrase,
            'num_layers': args.n_layers_enc,
            'bidirectional': True
        }))
    if args.pair_enc == 'bow':
        sent_encoder = BoWSentEncoder(
            vocab, text_field_embedder)  # maybe should take in CoVe/ELMO?
        pair_encoder = None  # model will just run sent_encoder on both inputs
    else:  # output will be 2 x d_hid_phrase (+ deep elmo)
        sent_encoder = HeadlessSentEncoder(vocab,
                                           text_field_embedder,
                                           n_layers_highway,
                                           phrase_layer,
                                           dropout=args.dropout,
                                           cove_layer=cove_layer,
                                           elmo_layer=elmo)
    # The `and` yields a truthy/falsy flag that is multiplied as 0 or 1.
    d_single = 2 * d_hid_phrase + (args.elmo and args.deep_elmo) * 1024
    # NOTE(review): d_pair is assigned only in the 'simple' and 'attn'
    # branches below, so for pair_enc == 'bow' (or any other value) the
    # build_classifiers call would hit an unbound d_pair — confirm.
    if args.pair_enc == 'simple':  # output will be 4 x [2 x d_hid_phrase (+ deep elmo)]
        pair_encoder = HeadlessPairEncoder(vocab,
                                           text_field_embedder,
                                           n_layers_highway,
                                           phrase_layer,
                                           cove_layer=cove_layer,
                                           elmo_layer=elmo,
                                           dropout=args.dropout)
        d_pair = d_single
    elif args.pair_enc == 'attn':
        log.info("\tUsing attention!")
        d_inp_model = 4 * d_hid_phrase + (args.elmo and args.deep_elmo) * 1024
        d_hid_model = d_hid_phrase  # make it as large as the original sentence encoding
        modeling_layer = s2s_e.by_name('lstm').from_params(
            Params({
                'input_size': d_inp_model,
                'hidden_size': d_hid_model,
                'num_layers': 1,
                'bidirectional': True
            }))
        pair_encoder = HeadlessPairAttnEncoder(vocab,
                                               text_field_embedder,
                                               n_layers_highway,
                                               phrase_layer,
                                               DotProductSimilarity(),
                                               modeling_layer,
                                               cove_layer=cove_layer,
                                               elmo_layer=elmo,
                                               deep_elmo=args.deep_elmo,
                                               dropout=args.dropout)
        d_pair = 2 * d_hid_phrase
        # output will be 4 x [2 x d_hid_model], where d_hid_model = 2 x d_hid_phrase
        #                = 4 x [2 x 2 x d_hid_phrase]

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, pair_encoder)
    build_classifiers(tasks, model, d_pair, d_single)
    if args.cuda >= 0:
        model = model.cuda()
    return model
def main():
    """Train and evaluate the SciBERT baseline for evidence inference.

    Parses the command-line options, loads the pickled annotations and the
    prompt CSV, reads the train/validation/test article-id splits, trains a
    ``Baseline`` model with the AllenNLP ``Trainer`` and prints the training
    result followed by the test-set metrics.

    Changes from the original:
      * the annotations pickle is opened in a ``with`` block, so the file
        handle is closed;
      * the trainer validates (and early-stops) on the validation split.
        The original passed ``test_data`` and never used ``valid_data`` for
        validation, which leaks the test set into model selection.

    Side effect: sets the module-level ``cuda_device``.
    """
    global cuda_device

    def read_article_ids(path):
        # One integer article id per line.
        with open(path) as id_file:
            return [int(line.strip()) for line in id_file]

    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience (default: 1)')
    parser.add_argument('--batch_size', type=int, default=8,
                        help='batch size (default: 8)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='attention',
                        help='model name (default: attention)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('data/data/p_annotations.p', 'rb') as annotations_file:
        processed_annotations = pickle.load(annotations_file)

    prompts = pd.read_csv('data/data/prompts_merged.csv')
    prompts_dictionary = {
        row['PromptID']: [row['Outcome'], row['Intervention'], row['Comparator']]
        for _, row in prompts.iterrows()
    }

    # Append outcome / intervention / comparator to every annotation item,
    # looked up by the prompt id stored in the item's last position.
    for article_key in processed_annotations:
        for article_item in processed_annotations[article_key]:
            article_item += prompts_dictionary[article_item[-1]]

    train = read_article_ids('data/splits/train_article_ids.txt')
    valid = read_article_ids('data/splits/validation_article_ids.txt')
    test = read_article_ids('data/splits/test_article_ids.txt')

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    reader = EIDatasetReader(bert_token_indexer, processed_annotations)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )
    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device
    if torch.cuda.is_available():
        logger.info('Running on GPU')
        model = model.cuda(cuda_device)
    else:
        logger.info('Running on CPU')
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_fields')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      # Validation split, not the test split, drives patience.
                      validation_dataset=valid_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
# NOTE(review): the `return` below is the tail of a method whose `def` lies
# above this chunk; its indentation is reconstructed — confirm.
        return {"accuracy": self.accuracy.get_metric(reset)}


# Read the tutorial's training and validation files; cached_path is given
# the URL of each file.
reader = PosDatasetReader()
train_dataset = reader.read(
    cached_path('https://raw.githubusercontent.com/allenai/allennlp'
                '/master/tutorials/tagger/training.txt'))
validation_dataset = reader.read(
    cached_path('https://raw.githubusercontent.com/allenai/allennlp'
                '/master/tutorials/tagger/validation.txt'))

# The vocabulary is built from both datasets.
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# Token embedding sized by the 'tokens' namespace, wrapped under the
# "tokens" key.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

# Sequence encoder: a batch-first LSTM wrapped for AllenNLP.
lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = LstmTagger(word_embeddings, lstm, vocab)

optimizer = optim.SGD(model.parameters(), lr=0.1)

# Batches of two, sorted by the number of tokens in the "sentence" field.
iterator = BucketIterator(batch_size=2,
                          sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

# Up to 1000 epochs with a patience of 10.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)
trainer.train()
# Rebuild the translation model with the training-time architecture and load
# the epoch-13 checkpoint onto the CPU for inference.
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='target_tokens'),
    },
    lazy=True,
)

# The vocabulary comes from the saved checkpoint directory, not from data.
vocab = Vocabulary.from_files(
    '/home/earendil/NLP/neural_machine_translation/checkpoint_vocab_epoch_13')

en_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size('tokens'),
    embedding_dim=EN_EMBEDDING_DIM,
)
encoder = StackedSelfAttentionEncoder(
    input_dim=EN_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    projection_dim=128,
    feedforward_hidden_dim=128,
    num_layers=1,
    num_attention_heads=8,
)
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
attention = DotProductAttention()

max_decoding_steps = 300
model_pred = SimpleSeq2Seq(
    vocab,
    source_embedder,
    encoder,
    max_decoding_steps,
    target_embedding_dim=ZH_EMBEDDING_DIM,
    target_namespace='target_tokens',
    attention=attention,
    beam_size=8,
    use_bleu=True,
)

# Reload the trained model.
with open('/home/earendil/NLP/neural_machine_translation/checkpoint_model_epoch_13', 'rb') as f:
    model_pred.load_state_dict(
        torch.load(f, map_location=torch.device('cpu')))

# Switch to evaluation mode.
model_pred.eval()
def __init__(self):
    """Build the reader, datasets, seq2seq model and trainer from ``conf``.

    Reads the ``'seq2seq_allen'`` section of ``conf`` for the data paths
    and the embedding/hidden sizes. Sets ``self.reader``,
    ``self.train_dataset``, ``self.valid_dataset``, ``self.model`` and
    ``self.trainer``.

    The model is moved to the GPU only when one is available. The original
    called ``self.model.cuda(cuda_device)`` unconditionally, which fails on
    CPU-only machines where ``cuda_device`` is ``-1``.
    """
    config = conf['seq2seq_allen']
    prefix = config['processed_data_prefix']
    train_file = config['train_data']
    valid_file = config['valid_data']
    src_embedding_dim = config['src_embedding_dim']
    trg_embedding_dim = config['trg_embedding_dim']
    hidden_dim = config['hidden_dim']

    cuda_device = 0 if torch.cuda.is_available() else -1

    self.reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        })

    self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
    self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

    vocab = Vocabulary.from_instances(self.train_dataset +
                                      self.valid_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    src_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=src_embedding_dim)

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

    source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

    attention = LinearAttention(hidden_dim,
                                hidden_dim,
                                activation=Activation.by_name('tanh')())

    self.model = SimpleSeq2Seq(
        vocab=vocab,
        source_embedder=source_embedder,
        encoder=encoder,
        max_decoding_steps=20,
        target_embedding_dim=trg_embedding_dim,
        target_namespace='target_tokens',
        attention=attention,  # pass attention
        use_bleu=True)

    optimizer = optim.Adam(self.model.parameters())
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("source_tokens", "num_tokens")])
    # The iterator needs the vocab so it can index the data during training.
    iterator.index_with(vocab)

    # Only move to the GPU when one exists; -1 means "stay on the CPU".
    if cuda_device >= 0:
        self.model.cuda(cuda_device)

    # NOTE(review): validation_metric is "+accuracy" while the model is
    # built with use_bleu=True — confirm the model reports "accuracy".
    self.trainer = Trainer(model=self.model,
                           optimizer=optimizer,
                           iterator=iterator,
                           patience=10,
                           validation_metric="+accuracy",
                           train_dataset=self.train_dataset,
                           validation_dataset=self.valid_dataset,
                           num_epochs=1,
                           cuda_device=cuda_device)
class TransformerQA(Model):
    """
    Registered as `"transformer_qa"`, this class implements a reading comprehension model patterned
    after the proposed model in [Devlin et al](https://arxiv.org/abs/1810.04805),
    with improvements borrowed from the SQuAD model in the transformers project.

    It predicts start tokens and end tokens with a linear layer on top of word piece embeddings.

    If you want to use this model on SQuAD datasets, you can use it with the
    [`TransformerSquadReader`](../../dataset_readers/transformer_squad#transformersquadreader)
    dataset reader, registered as `"transformer_squad"`.

    Note that the metrics that the model produces are calculated on a per-instance basis only. Since there could
    be more than one instance per question, these metrics are not the official numbers on either SQuAD task.

    To get official numbers for SQuAD v1.1, for example, you can run

    ```
    python -m allennlp_models.rc.tools.transformer_qa_eval
    ```

    # Parameters

    vocab : `Vocabulary`

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This model chooses the embedder according to this setting. You probably want to make sure this is set to
        the same thing as the reader.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 transformer_model_name: str = "bert-base-cased",
                 **kwargs) -> None:
        super().__init__(vocab, **kwargs)
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": PretrainedTransformerEmbedder(transformer_model_name)})
        # One output unit for the span-start score and one for the span-end score.
        self._linear_layer = nn.Linear(
            self._text_field_embedder.get_output_dim(), 2)

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._per_instance_metrics = SquadEmAndF1()

    def forward(  # type: ignore
        self,
        question_with_context: Dict[str, Dict[str, torch.LongTensor]],
        context_span: torch.IntTensor,
        cls_index: torch.LongTensor = None,
        answer_span: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        # Parameters

        question_with_context : `Dict[str, torch.LongTensor]`
            From a `TextField`. The model assumes that this text field contains the context followed by the
            question. It further assumes that the tokens have type ids set such that any token that can be part of
            the answer (i.e., tokens from the context) has type id 0, and any other token (including
            `[CLS]` and `[SEP]`) has type id 1.

        context_span : `torch.IntTensor`
            From a `SpanField`. This marks the span of word pieces in `question` from which answers can come.

        cls_index : `torch.LongTensor`, optional
            A tensor of shape `(batch_size,)` that provides the index of the `[CLS]` token
            in the `question_with_context` for each instance.

            This is needed because the `[CLS]` token is used to indicate that the question
            is impossible.

            If this is `None`, it's assumed that the `[CLS]` token is at index 0 for each instance
            in the batch.

        answer_span : `torch.IntTensor`, optional
            From a `SpanField`. This is the thing we are trying to predict - the span of text that marks the
            answer. If given, we compute a loss that gets included in the output dictionary.

        metadata : `List[Dict[str, Any]]`, optional
            If present, this should contain the question id, and the original texts of context, question, tokenized
            version of both, and a list of possible answers. The length of the `metadata` list should be the
            batch size, and each dictionary should have the keys `id`, `question`, `context`,
            `question_tokens`, `context_tokens`, and `answers`.

        # Returns

        `Dict[str, torch.Tensor]` :
            An output dictionary with the following fields:

            - span_start_logits (`torch.FloatTensor`) :
              A tensor of shape `(batch_size, passage_length)` representing unnormalized log
              probabilities of the span start position.
            - span_end_logits (`torch.FloatTensor`) :
              A tensor of shape `(batch_size, passage_length)` representing unnormalized log
              probabilities of the span end position (inclusive).
            - best_span_scores (`torch.FloatTensor`) :
              The score for each of the best spans.
            - loss (`torch.FloatTensor`, optional) :
              A scalar loss to be optimised, evaluated against `answer_span`.
            - best_span (`torch.IntTensor`, optional) :
              Provided when not in train mode and sufficient metadata given for the instance.
              The result of a constrained inference over `span_start_logits` and
              `span_end_logits` to find the most probable span.  Shape is `(batch_size, 2)`
              and each offset is a token index, unless the best span for an instance
              was predicted to be the `[CLS]` token, in which case the span will be (-1, -1).
            - best_span_str (`List[str]`, optional) :
              Provided when not in train mode and sufficient metadata given for the instance.
              This is the string from the original passage that the model thinks is the best answer
              to the question.

        """
        embedded_question = self._text_field_embedder(question_with_context)
        # shape: (batch_size, sequence_length, 2)
        logits = self._linear_layer(embedded_question)
        # shape: (batch_size, sequence_length, 1)
        span_start_logits, span_end_logits = logits.split(1, dim=-1)
        # shape: (batch_size, sequence_length)
        span_start_logits = span_start_logits.squeeze(-1)
        # shape: (batch_size, sequence_length)
        span_end_logits = span_end_logits.squeeze(-1)

        # Create a mask for `question_with_context` to mask out tokens that are not part
        # of the context.
        # shape: (batch_size, sequence_length)
        possible_answer_mask = torch.zeros_like(
            get_token_ids_from_text_field_tensors(question_with_context),
            dtype=torch.bool)
        for i, (start, end) in enumerate(context_span):
            possible_answer_mask[i, start:end + 1] = True
            # Also unmask the [CLS] token since that token is used to indicate that
            # the question is impossible.
            possible_answer_mask[
                i, 0 if cls_index is None else cls_index[i]] = True

        # Replace the masked values with a very negative constant since we're in log-space.
        # shape: (batch_size, sequence_length)
        span_start_logits = replace_masked_values_with_big_negative_number(
            span_start_logits, possible_answer_mask)
        # shape: (batch_size, sequence_length)
        span_end_logits = replace_masked_values_with_big_negative_number(
            span_end_logits, possible_answer_mask)

        # Now calculate the best span.
        # shape: (batch_size, 2)
        best_spans = get_best_span(span_start_logits, span_end_logits)

        # Sum the span start score with the span end score to get an overall score for the span.
        # shape: (batch_size,)
        best_span_scores = torch.gather(
            span_start_logits, 1,
            best_spans[:, 0].unsqueeze(1)) + torch.gather(
                span_end_logits, 1, best_spans[:, 1].unsqueeze(1))
        best_span_scores = best_span_scores.squeeze(1)

        output_dict = {
            "span_start_logits": span_start_logits,
            "span_end_logits": span_end_logits,
            "best_span_scores": best_span_scores,
        }

        # Compute the loss.
        if answer_span is not None:
            output_dict["loss"] = self._evaluate_span(best_spans,
                                                      span_start_logits,
                                                      span_end_logits,
                                                      answer_span)

        # Gather the string of the best span and compute the EM and F1 against the gold span,
        # if given.
        if not self.training and metadata is not None:
            (
                output_dict["best_span_str"],
                output_dict["best_span"],
            ) = self._collect_best_span_strings(best_spans, context_span,
                                                metadata, cls_index)

        return output_dict

    def _evaluate_span(
        self,
        best_spans: torch.Tensor,
        span_start_logits: torch.Tensor,
        span_end_logits: torch.Tensor,
        answer_span: torch.Tensor,
    ) -> torch.Tensor:
        """
        Calculate the loss against the `answer_span` and also update the span metrics.
        """
        span_start = answer_span[:, 0]
        span_end = answer_span[:, 1]
        self._span_accuracy(best_spans, answer_span)

        start_loss = cross_entropy(span_start_logits,
                                   span_start,
                                   ignore_index=-1)
        # Cap the sanity threshold at the dtype's maximum so the comparison
        # stays meaningful for low-precision losses.
        big_constant = min(torch.finfo(start_loss.dtype).max, 1e9)
        assert not torch.any(start_loss > big_constant), "Start loss too high"

        end_loss = cross_entropy(span_end_logits, span_end, ignore_index=-1)
        assert not torch.any(end_loss > big_constant), "End loss too high"

        self._span_start_accuracy(span_start_logits, span_start)
        self._span_end_accuracy(span_end_logits, span_end)

        return (start_loss + end_loss) / 2

    def _collect_best_span_strings(
        self,
        best_spans: torch.Tensor,
        context_span: torch.IntTensor,
        metadata: List[Dict[str, Any]],
        cls_index: Optional[torch.LongTensor],
    ) -> Tuple[List[str], torch.Tensor]:
        """
        Collect the string of the best predicted span from the context metadata and
        update `self._per_instance_metrics`, which in the case of SQuAD v1.1 / v2.0
        includes the EM and F1 score.

        This returns a `Tuple[List[str], torch.Tensor]`, where the `List[str]` is the
        predicted answer for each instance in the batch, and the tensor is just the input
        tensor `best_spans` after adjustments so that each answer span corresponds to the
        context tokens only, and not the question tokens. Spans that correspond to the
        `[CLS]` token, i.e. the question was predicted to be impossible, will be set
        to `(-1, -1)`.
        """
        # `.cpu()` returns the tensor itself when it already lives on the CPU, so
        # the NumPy array below shares memory with `best_spans` and the edits in
        # the loop happen in place. On any other device `.cpu()` makes a copy,
        # so the adjusted values are written back after the loop.
        cpu_best_spans = best_spans.detach().cpu()
        _best_spans = cpu_best_spans.numpy()

        # `cls_index or ...` would call `bool()` on a tensor, which raises for
        # batches with more than one element; test against `None` explicitly.
        cls_indices = cls_index if cls_index is not None else [0] * len(metadata)

        best_span_strings: List[str] = []
        best_span_strings_for_metric: List[str] = []
        answer_strings_for_metric: List[List[str]] = []

        for (metadata_entry, best_span, cspan, cls_ind) in zip(
                metadata,
                _best_spans,
                context_span,
                cls_indices,
        ):
            context_tokens_for_question = metadata_entry["context_tokens"]
            # `int()` accepts both plain ints and single-element tensors on any device.
            if best_span[0] == int(cls_ind):
                # Predicting [CLS] is interpreted as predicting the question as unanswerable.
                best_span_string = ""
                best_span[0], best_span[1] = -1, -1
            else:
                # Re-base the span so that index 0 is the first context token.
                best_span -= int(cspan[0])
                assert np.all(best_span >= 0)

                predicted_start, predicted_end = tuple(best_span)

                # Walk left until a token with a known character offset is found.
                while (predicted_start >= 0 and
                       context_tokens_for_question[predicted_start].idx is
                       None):
                    predicted_start -= 1
                if predicted_start < 0:
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[0]].text}' at index "
                        f"'{best_span[0]}' to an offset in the original text.")
                    character_start = 0
                else:
                    character_start = context_tokens_for_question[
                        predicted_start].idx

                # Walk right until a token with a known character offset is found.
                while (predicted_end < len(context_tokens_for_question) and
                       context_tokens_for_question[predicted_end].idx is None):
                    predicted_end += 1
                if predicted_end >= len(context_tokens_for_question):
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[1]].text}' at index "
                        f"'{best_span[1]}' to an offset in the original text.")
                    character_end = len(metadata_entry["context"])
                else:
                    end_token = context_tokens_for_question[predicted_end]
                    character_end = end_token.idx + len(
                        sanitize_wordpiece(end_token.text))

                best_span_string = metadata_entry["context"][
                    character_start:character_end]

            best_span_strings.append(best_span_string)
            answers = metadata_entry.get("answers")
            if answers:
                best_span_strings_for_metric.append(best_span_string)
                answer_strings_for_metric.append(answers)

        # Propagate the adjustments when `best_spans` is not a CPU tensor.
        if cpu_best_spans.device != best_spans.device:
            best_spans.copy_(cpu_best_spans)

        if answer_strings_for_metric:
            self._per_instance_metrics(best_span_strings_for_metric,
                                       answer_strings_for_metric)

        return best_span_strings, best_spans

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Return the span accuracies and, outside of training, the per-instance EM and F1.
        """
        output = {
            "start_acc": self._span_start_accuracy.get_metric(reset),
            "end_acc": self._span_end_accuracy.get_metric(reset),
            "span_acc": self._span_accuracy.get_metric(reset),
        }
        if not self.training:
            exact_match, f1_score = self._per_instance_metrics.get_metric(
                reset)
            output["per_instance_em"] = exact_match
            output["per_instance_f1"] = f1_score
        return output

    default_predictor = "transformer_qa"
# vocab_namespace="source_char_tokens", # vocab=vocab) # src_char_encoder = TokenCharactersEncoder(embedding=src_char_embedding, # encoder=GruSeq2VecEncoder(input_size=args.emb_dim, # hidden_size=args.hid_dim)) tgt_embedding = Embedding(embedding_dim=args.emb_dim, vocab_namespace="target_tokens", vocab=vocab) # tgt_char_embedding = Embedding(embedding_dim=args.emb_dim, # vocab_namespace="target_char_tokens", # vocab=vocab) # tgt_char_encoder = TokenCharactersEncoder(embedding=tgt_char_embedding, # encoder=GruSeq2VecEncoder(input_size=args.emb_dim, # hidden_size=args.hid_dim)) src_embedders = BasicTextFieldEmbedder({ "tokens": src_embedding, # "character_tokens": src_char_encoder }) # tgt_embedders = BasicTextFieldEmbedder({ # "tokens": tgt_embedding, # "character_tokens": tgt_char_encoder # }) train_loader = SimpleDataLoader.from_dataset_reader( reader=dataset_reader, data_path=args.train_file, batch_size=args.bs, shuffle=True) train_loader.index_with(vocab) val_loader = SimpleDataLoader.from_dataset_reader( reader=dataset_reader, data_path=args.valid_file, batch_size=args.bs) val_loader.index_with(vocab)
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the three stacked task heads (NER, EMD, relation extraction).

    # Parameters

    vocab : `Vocabulary`
        Shared vocabulary handed to every tagger.
    params : `Params`
        Must contain the keys `"text_field_embedder"`, `"ner"`, `"emd"` and
        `"relation"`. Each of the three task sections must in turn contain
        an `"encoder"` and a `"tagger"` section.
    regularizer : `RegularizerApplicator`, optional
        Passed to the base model and to both CRF taggers.
    """
    super(LayerNerEmdRelation, self).__init__(vocab=vocab,
                                              regularizer=regularizer)

    # Base text Field Embedder
    text_field_embedder_params = params.pop("text_field_embedder")
    text_field_embedder = BasicTextFieldEmbedder.from_params(
        vocab=vocab, params=text_field_embedder_params)
    self._text_field_embedder = text_field_embedder

    ############
    # NER Stuffs
    ############
    ner_params = params.pop("ner")

    # Encoder
    encoder_ner_params = ner_params.pop("encoder")
    encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
    self._encoder_ner = encoder_ner

    # Tagger NER - CRF Tagger
    tagger_ner_params = ner_params.pop("tagger")
    tagger_ner = CrfTagger(
        vocab=vocab,
        text_field_embedder=self._text_field_embedder,
        encoder=self._encoder_ner,
        label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
        constraint_type=tagger_ner_params.pop("constraint_type", None),
        dropout=tagger_ner_params.pop("dropout", None),
        regularizer=regularizer,
    )
    self._tagger_ner = tagger_ner

    ############
    # EMD Stuffs
    ############
    emd_params = params.pop("emd")

    # Encoder
    encoder_emd_params = emd_params.pop("encoder")
    encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
    self._encoder_emd = encoder_emd

    # The EMD embedder is built on the base embedder plus the NER encoder.
    shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder=self._text_field_embedder,
        previous_encoders=[self._encoder_ner])
    self._shortcut_text_field_embedder = shortcut_text_field_embedder

    # Tagger: EMD - CRF Tagger
    tagger_emd_params = emd_params.pop("tagger")
    tagger_emd = CrfTagger(
        vocab=vocab,
        text_field_embedder=self._shortcut_text_field_embedder,
        encoder=self._encoder_emd,
        label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
        constraint_type=tagger_emd_params.pop("constraint_type", None),
        # Read the dropout from the EMD section. The NER section's "dropout"
        # key was already popped above, so reading it again always gave None.
        dropout=tagger_emd_params.pop("dropout", None),
        regularizer=regularizer,
    )
    self._tagger_emd = tagger_emd

    ############################
    # Relation Extraction Stuffs
    ############################
    relation_params = params.pop("relation")

    # Encoder
    encoder_relation_params = relation_params.pop("encoder")
    encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
    self._encoder_relation = encoder_relation

    # The relation embedder is built on the base embedder plus both earlier encoders.
    shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder=self._text_field_embedder,
        previous_encoders=[self._encoder_ner, self._encoder_emd])
    self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation

    # Tagger: Relation
    tagger_relation_params = relation_params.pop("tagger")
    tagger_relation = RelationExtractor(
        vocab=vocab,
        text_field_embedder=self._shortcut_text_field_embedder_relation,
        context_layer=self._encoder_relation,
        d=tagger_relation_params.pop_int("d"),
        l=tagger_relation_params.pop_int("l"),
        n_classes=tagger_relation_params.pop("n_classes"),
        activation=tagger_relation_params.pop("activation"),
    )
    self._tagger_relation = tagger_relation

    logger.info("Multi-Task Learning Model has been instantiated.")
'/master/tutorials/tagger/validation.txt')) vocab = Vocabulary.from_instances(train_dataset + validation_dataset) EMBEDDING_DIM = 6 HIDDEN_DIM = 6 model_params = Params({ 'type': 'lstm', 'input_size': EMBEDDING_DIM, 'hidden_size': HIDDEN_DIM }) token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM) word_embedding = BasicTextFieldEmbedder({'tokens': token_embedding}) lstm = Seq2SeqEncoder.from_params(model_params) model = POSTagger(word_embedding, lstm, vocab) optimizer = optim.SGD(model.parameters(), lr=0.1) iterator = BucketIterator(batch_size=2, sorting_keys=[('sentence', 'num_tokens')]) iterator.index_with(vocab) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=validation_dataset,
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """
    Build the three stacked task heads (NER, EMD, coreference resolution).

    # Parameters

    vocab : `Vocabulary`
        Shared vocabulary handed to every tagger.
    params : `Params`
        Must contain the keys `"text_field_embedder"`, `"ner"`, `"emd"` and
        `"coref"`. Each of the three task sections must in turn contain
        an `"encoder"` and a `"tagger"` section.
    regularizer : `RegularizerApplicator`, optional
        Passed to the base model and to every tagger.
    """
    super(LayerNerEmdCoref, self).__init__(vocab=vocab,
                                           regularizer=regularizer)

    # Base text Field Embedder
    text_field_embedder_params = params.pop("text_field_embedder")
    text_field_embedder = BasicTextFieldEmbedder.from_params(
        vocab=vocab, params=text_field_embedder_params)
    self._text_field_embedder = text_field_embedder

    ############
    # NER Stuffs
    ############
    ner_params = params.pop("ner")

    # Encoder
    encoder_ner_params = ner_params.pop("encoder")
    encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
    self._encoder_ner = encoder_ner

    # Tagger NER - CRF Tagger
    tagger_ner_params = ner_params.pop("tagger")
    tagger_ner = CrfTagger(
        vocab=vocab,
        text_field_embedder=self._text_field_embedder,
        encoder=self._encoder_ner,
        label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
        constraint_type=tagger_ner_params.pop("constraint_type", None),
        dropout=tagger_ner_params.pop("dropout", None),
        regularizer=regularizer,
    )
    self._tagger_ner = tagger_ner

    ############
    # EMD Stuffs
    ############
    emd_params = params.pop("emd")

    # Encoder
    encoder_emd_params = emd_params.pop("encoder")
    encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
    self._encoder_emd = encoder_emd

    # The EMD embedder is built on the base embedder plus the NER encoder.
    shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder=self._text_field_embedder,
        previous_encoders=[self._encoder_ner])
    self._shortcut_text_field_embedder = shortcut_text_field_embedder

    # Tagger: EMD - CRF Tagger
    tagger_emd_params = emd_params.pop("tagger")
    tagger_emd = CrfTagger(
        vocab=vocab,
        text_field_embedder=self._shortcut_text_field_embedder,
        encoder=self._encoder_emd,
        label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
        constraint_type=tagger_emd_params.pop("constraint_type", None),
        # Read the dropout from the EMD section. The NER section's "dropout"
        # key was already popped above, so reading it again always gave None.
        dropout=tagger_emd_params.pop("dropout", None),
        regularizer=regularizer,
    )
    self._tagger_emd = tagger_emd

    ##############
    # Coref Stuffs
    ##############
    coref_params = params.pop("coref")

    # Encoder
    encoder_coref_params = coref_params.pop("encoder")
    encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
    self._encoder_coref = encoder_coref

    # The coref embedder is built on the base embedder plus both earlier encoders.
    shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(
        base_text_field_embedder=self._text_field_embedder,
        previous_encoders=[self._encoder_ner, self._encoder_emd])
    self._shortcut_text_field_embedder_coref = shortcut_text_field_embedder_coref

    # Tagger: Coreference
    tagger_coref_params = coref_params.pop("tagger")
    eval_on_gold_mentions = tagger_coref_params.pop_bool(
        "eval_on_gold_mentions", False)
    init_params = tagger_coref_params.pop("initializer", None)
    # Fall back to a default-constructed applicator when none is configured.
    initializer = (InitializerApplicator.from_params(init_params)
                   if init_params is not None else InitializerApplicator())

    tagger_coref = CoreferenceCustom(
        vocab=vocab,
        text_field_embedder=self._shortcut_text_field_embedder_coref,
        context_layer=self._encoder_coref,
        mention_feedforward=FeedForward.from_params(
            tagger_coref_params.pop("mention_feedforward")),
        antecedent_feedforward=FeedForward.from_params(
            tagger_coref_params.pop("antecedent_feedforward")),
        feature_size=tagger_coref_params.pop_int("feature_size"),
        max_span_width=tagger_coref_params.pop_int("max_span_width"),
        spans_per_word=tagger_coref_params.pop_float("spans_per_word"),
        max_antecedents=tagger_coref_params.pop_int("max_antecedents"),
        lexical_dropout=tagger_coref_params.pop_float(
            "lexical_dropout", 0.2),
        initializer=initializer,
        regularizer=regularizer,
        eval_on_gold_mentions=eval_on_gold_mentions,
    )
    self._tagger_coref = tagger_coref
    if eval_on_gold_mentions:
        self._tagger_coref._eval_on_gold_mentions = True

    logger.info("Multi-Task Learning Model has been instantiated.")
class TransformerQA(Model):
    """
    This class implements a reading comprehension model patterned after the proposed model in
    https://arxiv.org/abs/1810.04805 (Devlin et al), with improvements borrowed from the SQuAD model in the
    transformers project.

    It predicts start tokens and end tokens with a linear layer on top of word piece embeddings.

    Note that the metrics that the model produces are calculated on a per-instance basis only. Since there could
    be more than one instance per question, these metrics are not the official numbers on the SQuAD task. To get
    official numbers, run the script in scripts/transformer_qa_eval.py.

    Parameters
    ----------
    vocab : ``Vocabulary``
    transformer_model_name : ``str``, optional (default=``bert-base-cased``)
        This model chooses the embedder according to this setting. You probably want to make sure this is set to
        the same thing as the reader.
    hidden_size : ``int``, optional (default=``768``)
        Forwarded unchanged to ``PretrainedTransformerEmbedder``.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 transformer_model_name: str = "bert-base-cased",
                 hidden_size=768,
                 **kwargs) -> None:
        super().__init__(vocab, **kwargs)
        self._text_field_embedder = BasicTextFieldEmbedder({
            "tokens":
            PretrainedTransformerEmbedder(transformer_model_name,
                                          hidden_size=hidden_size,
                                          task="QA")
        })
        # One output unit for the span-start score and one for the span-end score.
        self._linear_layer = nn.Linear(
            self._text_field_embedder.get_output_dim(), 2)

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._per_instance_metrics = SquadEmAndF1()

    def forward(  # type: ignore
        self,
        question_with_context: Dict[str, Dict[str, torch.LongTensor]],
        context_span: torch.IntTensor,
        answer_span: Optional[torch.IntTensor] = None,
        metadata: Optional[List[Dict[str, Any]]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Parameters
        ----------
        question_with_context : Dict[str, torch.LongTensor]
            From a ``TextField``. The model assumes that this text field contains the context followed by the
            question. It further assumes that the tokens have type ids set such that any token that can be part of
            the answer (i.e., tokens from the context) has type id 0, and any other token (including [CLS] and
            [SEP]) has type id 1.
        context_span : ``torch.IntTensor``
            From a ``SpanField``. This marks the span of word pieces in ``question`` from which answers can come.
        answer_span : ``torch.IntTensor``, optional
            From a ``SpanField``. This is the thing we are trying to predict - the span of text that marks the
            answer. If given, we compute a loss that gets included in the output dictionary.
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question id, and the original texts of context, question, tokenized
            version of both, and a list of possible answers. The length of the ``metadata`` list should be the
            batch size, and each dictionary should have the keys ``id``, ``question``, ``context``,
            ``question_tokens``, ``context_tokens``, and ``answers``.

        Returns
        -------
        An output dictionary consisting of:
        span_start_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span start position.
        span_start_probs : torch.FloatTensor
            The result of ``softmax(span_start_logits)``.
        span_end_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span end position (inclusive).
        span_end_probs : torch.FloatTensor
            The result of ``softmax(span_end_logits)``.
        best_span : torch.IntTensor
            The result of a constrained inference over ``span_start_logits`` and
            ``span_end_logits`` to find the most probable span.  Shape is ``(batch_size, 2)``
            and each offset is a token index.
        best_span_scores : torch.FloatTensor
            The score for each of the best spans.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        best_span_str : List[str]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        context_tokens : List
            Only present when ``metadata`` is given: the ``context_tokens`` entry of every instance.
        """
        embedded_question = self._text_field_embedder(question_with_context)
        logits = self._linear_layer(embedded_question)
        span_start_logits, span_end_logits = logits.split(1, dim=-1)
        span_start_logits = span_start_logits.squeeze(-1)
        span_end_logits = span_end_logits.squeeze(-1)

        # Only positions inside the context span may be chosen as an answer.
        possible_answer_mask = torch.zeros_like(
            get_token_ids_from_text_field_tensors(question_with_context),
            dtype=torch.bool)
        for i, (start, end) in enumerate(context_span):
            possible_answer_mask[i, start:end + 1] = True

        span_start_logits = util.replace_masked_values(span_start_logits,
                                                       possible_answer_mask,
                                                       -1e32)
        span_end_logits = util.replace_masked_values(span_end_logits,
                                                     possible_answer_mask,
                                                     -1e32)
        span_start_probs = torch.nn.functional.softmax(span_start_logits,
                                                       dim=-1)
        span_end_probs = torch.nn.functional.softmax(span_end_logits, dim=-1)
        best_spans = get_best_span(span_start_logits, span_end_logits)
        # Overall span score = start score + end score.
        best_span_scores = torch.gather(
            span_start_logits, 1,
            best_spans[:, 0].unsqueeze(1)) + torch.gather(
                span_end_logits, 1, best_spans[:, 1].unsqueeze(1))
        best_span_scores = best_span_scores.squeeze(1)

        output_dict = {
            "span_start_logits": span_start_logits,
            "span_start_probs": span_start_probs,
            "span_end_logits": span_end_logits,
            "span_end_probs": span_end_probs,
            "best_span": best_spans,
            "best_span_scores": best_span_scores,
        }

        # Compute the loss for training.
        if answer_span is not None:
            span_start = answer_span[:, 0]
            span_end = answer_span[:, 1]
            # Instances whose gold start is -1 are excluded from the metrics.
            span_mask = span_start != -1
            self._span_accuracy(best_spans, answer_span,
                                span_mask.unsqueeze(-1).expand_as(best_spans))

            start_loss = cross_entropy(span_start_logits,
                                       span_start,
                                       ignore_index=-1)
            if torch.any(start_loss > 1e9):
                logger.critical("Start loss too high (%r)", start_loss)
                logger.critical("span_start_logits: %r", span_start_logits)
                logger.critical("span_start: %r", span_start)
                assert False

            end_loss = cross_entropy(span_end_logits,
                                     span_end,
                                     ignore_index=-1)
            if torch.any(end_loss > 1e9):
                logger.critical("End loss too high (%r)", end_loss)
                logger.critical("span_end_logits: %r", span_end_logits)
                logger.critical("span_end: %r", span_end)
                assert False

            loss = (start_loss + end_loss) / 2

            self._span_start_accuracy(span_start_logits, span_start,
                                      span_mask)
            self._span_end_accuracy(span_end_logits, span_end, span_mask)

            output_dict["loss"] = loss

        # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
        if metadata is not None:
            # NOTE(review): for CPU tensors this NumPy array shares memory with
            # `output_dict["best_span"]`, so the in-place shift below presumably
            # also changes that output - confirm this is intended.
            best_spans = best_spans.detach().cpu().numpy()

            output_dict["best_span_str"] = []
            context_tokens = []
            for metadata_entry, best_span in zip(metadata, best_spans):
                context_tokens_for_question = metadata_entry["context_tokens"]
                context_tokens.append(context_tokens_for_question)

                # Shift the span by the number of question tokens plus three
                # further positions so that it indexes into the context tokens.
                best_span -= 1 + len(metadata_entry["question_tokens"]) + 2
                assert np.all(best_span >= 0)

                predicted_start, predicted_end = tuple(best_span)

                # Walk left until a token with a known character offset is found.
                while (predicted_start >= 0 and
                       context_tokens_for_question[predicted_start].idx is
                       None):
                    predicted_start -= 1
                if predicted_start < 0:
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[0]].text}' at index "
                        f"'{best_span[0]}' to an offset in the original text.")
                    character_start = 0
                else:
                    character_start = context_tokens_for_question[
                        predicted_start].idx

                # Walk right until a token with a known character offset is found.
                while (predicted_end < len(context_tokens_for_question) and
                       context_tokens_for_question[predicted_end].idx is None):
                    predicted_end += 1
                if predicted_end >= len(context_tokens_for_question):
                    logger.warning(
                        f"Could not map the token '{context_tokens_for_question[best_span[1]].text}' at index "
                        f"'{best_span[1]}' to an offset in the original text.")
                    character_end = len(metadata_entry["context"])
                else:
                    end_token = context_tokens_for_question[predicted_end]
                    character_end = end_token.idx + len(
                        sanitize_wordpiece(end_token.text))

                best_span_string = metadata_entry["context"][
                    character_start:character_end]
                output_dict["best_span_str"].append(best_span_string)

                # `.get` returns None when no answers are attached; a plain
                # truthiness check covers both None and an empty list.
                answers = metadata_entry.get("answers")
                if answers:
                    self._per_instance_metrics(best_span_string, answers)
            output_dict["context_tokens"] = context_tokens
        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Return the span accuracies together with the per-instance EM and F1.
        """
        exact_match, f1_score = self._per_instance_metrics.get_metric(reset)
        return {
            "start_acc": self._span_start_accuracy.get_metric(reset),
            "end_acc": self._span_end_accuracy.get_metric(reset),
            "span_acc": self._span_accuracy.get_metric(reset),
            "per_instance_em": exact_match,
            "per_instance_f1": f1_score,
        }
from allennlp.data import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
import torch

# Pre-trained GloVe vectors (50 dimensions) used to initialise the embedding.
glove_file = 'https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

# Register the tokens of the example sentence under their own namespace.
vocab = Vocabulary()
sentence_tokens = ['This', 'is', 'some', 'text', '.']
vocab.add_tokens_to_namespace(sentence_tokens, namespace='token_vocab')

# Same structure as the output of TextField.as_tensor with a
# SingleIdTokenIndexer; see the exercises above.
token_ids = torch.LongTensor([1, 3, 2, 1, 4, 3])
token_tensor = {'tokens': {'tokens': token_ids}}

# Per-token embedding layer, wrapped so that it can consume the nested
# text-field tensor dictionary directly.
embedding = Embedding(
    vocab=vocab,
    vocab_namespace='token_vocab',
    embedding_dim=50,
    pretrained_file=glove_file,
)
embedder = BasicTextFieldEmbedder(token_embedders={'tokens': embedding})

embedded_tokens = embedder(token_tensor)
print(embedded_tokens.size())
config["validation_cont"]["candidate_set_path"], config["validation_cont"]["candidate_set_from_to"][1]) # embedding layer (use pre-trained, but make it trainable as well) if config["token_embedder_type"] == "embedding": vocab = Vocabulary.from_files(config["vocab_directory"]) tokens_embedder = Embedding.from_params( vocab, Params({ "pretrained_file": config["pre_trained_embedding"], "embedding_dim": config["pre_trained_embedding_dim"], "trainable": config["train_embedding"], "padding_index": 0, "sparse": config["sparse_gradient_embedding"] })) word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder}) elif config["token_embedder_type"] == "fasttext": vocab = None #FastTextVocab(config["fasttext_vocab_mapping"]) tokens_embedder = FastTextEmbeddingBag( numpy.load(config["fasttext_weights"]), sparse=True, requires_grad=config["train_embedding"], mode=config["fasttext_merge_mode"]) word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder}, allow_unmatched_keys=True, embedder_to_indexer_map={ "tokens": { "tokens": "tokens", "offsets": "offsets" }
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder bert_embedder = PretrainedBertEmbedder( pretrained_model = "./biobert_v1.1_pubmed/weights.tar.gz", top_layer_only=True, requires_grad=False ) #print('Bert Model:', bert_embedder.bert_model.encoder.layer[11]) for param in bert_embedder.bert_model.encoder.layer[8:].parameters(): param.requires_grad = True word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder( token_embedders={"tokens": bert_embedder}, allow_unmatched_keys=True) # %% BERT_DIM = word_embeddings.get_output_dim() print('Bert dim:', BERT_DIM) class BertSentencePooler(Seq2VecEncoder): def __init__(self, vocab): super().__init__(vocab) def forward(self, embs:torch.tensor, mask:torch.tensor=None) -> torch.tensor: bert_out = embs[:, :, 0] return bert_out def get_output_dim(self) -> int:
# options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" # Use the 'Small' pre-trained model # options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo' # '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json') # weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo' # '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5') elmo_embedder = ElmoTokenEmbedder(options_file, weight_file) vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3}) # Pass in the ElmoTokenEmbedder instance instead word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder}) # The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states] elmo_embedding_dim = 1024 lstm = PytorchSeq2VecWrapper( torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True)) model = LstmClassifier(word_embeddings, lstm, vocab) optimizer = optim.AdamW(model.parameters()) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
def main():
    """Train an ELMo + LSTM sentiment classifier and classify one sentence.

    Reads the Stanford Sentiment Treebank train/dev trees from ``data/``,
    embeds tokens with a pre-trained ELMo model, trains an ``LstmClassifier``
    for up to 20 epochs (early stopping with patience 10), then prints the
    predicted label for a fixed example sentence.

    ``HIDDEN_DIM`` is read from module scope.
    """
    # In order to use ELMo, each word in a sentence needs to be indexed with
    # an array of character IDs.
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # Initialize the ELMo-based token embedder using a pre-trained file.
    # This takes a while if you run this script for the first time.
    #
    # All pre-trained variants share one URL layout and differ only in the
    # model name, so switching model is a one-line change:
    #   original: 2x4096_512_2048cnn_2xhighway
    #   medium:   2x2048_256_2048cnn_1xhighway
    #   small:    2x1024_128_2048cnn_1xhighway   (used here)
    elmo_root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
    elmo_model = '2x1024_128_2048cnn_1xhighway'
    options_file = f'{elmo_root}/{elmo_model}/elmo_{elmo_model}_options.json'
    weight_file = f'{elmo_root}/{elmo_model}/elmo_{elmo_model}_weights.hdf5'

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # The dimension of the ELMo embedding is 2 x [size of LSTM hidden states]
    # of the chosen pre-trained model. Ask the embedder for it instead of
    # hard-coding the number, so it cannot drift out of sync with the weights
    # selected above.
    elmo_embedding_dim = word_embeddings.get_output_dim()

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    # Cast to a plain int so the vocabulary lookup does not depend on how a
    # numpy integer hashes.
    label_id = int(np.argmax(logits))

    print(model.vocab.get_token_from_index(label_id, 'labels'))
# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer; # Note that we added the batch dimension at the front. You choose the 'indexer1' # name when you configure your data processing code. token_tensor = {'indexer1': {'tokens': torch.LongTensor([[1, 3, 2, 9, 4, 3]])}} # You would typically get the number of embeddings here from the vocabulary; # if you use `allennlp train`, there is a separate process for instantiating the # Embedding object using the vocabulary that you don't need to worry about for # now. embedding = Embedding(num_embeddings=10, embedding_dim=3) # This 'indexer1' key must match the 'indexer1' key in the `token_tensor` above. # We use these names to align the TokenIndexers used in the data code with the # TokenEmbedders that do the work on the model side. embedder = BasicTextFieldEmbedder(token_embedders={'indexer1': embedding}) embedded_tokens = embedder(token_tensor) print("Using the TextFieldEmbedder:", embedded_tokens) # As we've said a few times, what's going on inside is that we match keys between # the token tensor and the token embedders, then pass the inner dictionary to the # token embedder. The above lines perform the following logic: embedded_tokens = embedding(**token_tensor['indexer1']) print("Using the Embedding directly:", embedded_tokens) # This is what gets created by TextField.as_tensor with a TokenCharactersIndexer # Note that we added the batch dimension at the front. Don't worry too much # about the magic 'token_characters' key - that is hard-coded to be produced # by the TokenCharactersIndexer, and accepted by TokenCharactersEncoder; # you don't have to produce those yourself in normal settings, it's done for you.
def main(args):
    """Train (or reload) a multi-task CRF tagger on ELMo and score the test sets.

    In training mode (``args.test_mode`` falsy) the train/dev splits of every
    selected task are read, a vocabulary is built and saved, the model is
    trained, and the trainer's metrics are written to ``training_stats.json``.
    In test mode the vocabulary and the ``best.th`` weights are loaded from
    ``args.model_dir`` instead. In both modes the result of
    ``evaluate_multiple_data`` on the tasks' test files is then written to
    ``test_stats.json`` in ``args.model_dir``.

    ``options_file`` and ``weight_file`` (the ELMo files) are read from module
    scope.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``dataset_paths_file``, ``dataset_path_prefix``, ``task``,
            ``proj_dim``, ``dropout``, ``lr``, ``weight_decay``,
            ``batch_size``, ``epochs``, ``patience``, ``model_dir``,
            ``clean_model_dir``, ``cuda`` and ``test_mode``. The whole
            namespace is also passed to ``get_task_encoder_dict`` and dumped
            to ``arguments.json`` in training mode.
    """
    ALL_DATASET_PATHS = get_all_dataset_paths(
        args.dataset_paths_file, args.dataset_path_prefix
    )
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name] for task_name in SELECTED_TASK_NAMES
    }

    # Map each known tag namespace to its position in TASK_CONFIGS; readers
    # receive the bound ``dict.get`` as their hashing function.
    tag_namespace_hashing_fn = {
        tag_namespace: i for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    # The suffix after the last underscore of a task name is its task type.
    task_suffixes = {
        task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES
    }
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths, readers, data_split="train")
        validation_dataset = read_datasets(dataset_paths, readers, data_split="dev")
        vocab = create_vocab([train_dataset, validation_dataset])

        # Special case for CCG: register tags by hand so they are in the
        # vocabulary regardless of what the train/dev data contained.
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ("B-NOUN.SHAPE", "I-NOUN.PROCESS"):
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    vocab.add_token_to_namespace("CONJ", task.tag_namespace)
    else:
        vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))

    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(
            partition_key="dataset", batch_size=BATCH_SIZE, cache_instances=True
        )
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR and os.path.exists(SERIALIZATION_DIR):
            logger.info(f"Deleting {SERIALIZATION_DIR}")
            shutil.rmtree(SERIALIZATION_DIR)
        # Create the directory whenever it is missing, not only after a
        # clean: the files written below need it on a first run too.
        if not os.path.exists(SERIALIZATION_DIR):
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR, exist_ok=True)

        logger.info(f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"), "w") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))

        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values())
        )

        optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
        )
        # Kept as a one-element list so the JSON layout of
        # training_stats.json stays the same.
        training_stats = [trainer.train()]
        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"), "w") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        # Deserialize onto the CPU first: the checkpoint may have been saved
        # from a GPU index that does not exist on this machine, and
        # load_state_dict copies the values into the model's parameters,
        # which already live on CUDA_DEVICE.
        state_dict = torch.load(
            os.path.join(SERIALIZATION_DIR, "best.th"), map_location="cpu"
        )
        model.load_state_dict(state_dict)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"] for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(
        partition_key="dataset", batch_size=BATCH_SIZE * 2
    )
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(
        model, readers, test_iterator, test_filepaths, cuda_device=CUDA_DEVICE
    )
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w") as fp:
        json.dump(test_stats, fp, indent=2)