def run_config(config):
    """Build the pipeline described by a JSON config string and train it.

    Parameters
    ----------
    config : str
        JSON text. A ``dataset_reader`` section is mandatory; ``iterator``,
        ``train_data_path`` and ``validation_data_path`` are always read, and
        ``trainer`` is read when a ``model`` section is present.

    Returns
    -------
    dict or None
        ``None`` when there is no ``model`` section (preview mode); otherwise
        a dict with the keys ``params`` (a copy of the parsed config taken
        before any section was popped), ``dataset_reader``, ``vocab`` and
        ``model``.

    Raises
    ------
    RuntimeError
        If ``config`` has no ``dataset_reader`` section.
    """
    # Local import so this function does not depend on a file-level import.
    from itertools import islice

    params = Params(json.loads(config))
    # `pop` consumes entries, so keep a pristine copy to hand back to the caller.
    params_copy = params.duplicate()

    if "dataset_reader" not in params:
        raise RuntimeError("`dataset_reader` section is required")
    reader = DatasetReader.from_params(params.pop("dataset_reader"))

    loader_params = params.pop("iterator")
    # The loader settings are used twice, so the first use gets a duplicate.
    train_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode: no model configured, so only preview the data.
        # Cap the preview at 10 so the output matches the message.
        print("Showing the first 10 instances:")
        for inst in islice(train_data_loader.iter_instances(), 10):
            print(inst)
        return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # Serialize into a temporary directory that is cleaned up on exit.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
def test_ultra_fine_reader(self):
    """Check tokens, segment ids and batched labels read from the ultra-fine fixture."""
    reader = get_reader("entity")
    instances = ensure_list(
        reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

    # Two instances are expected from the fixture.
    self.assertEqual(len(instances), 2)

    # Pair every token of the first instance with its segment id.
    first_instance = instances[0]
    token_texts = [token.text for token in first_instance['tokens']]
    segment_ids = list(first_instance['segment_ids'].array)
    observed_pairs = list(zip(token_texts, segment_ids))

    sentence_tokens = [
        '[CLS]', 'the', 'british', 'information', 'commissioner', "'s",
        'office', 'invites', '[unused0]', 'to', 'locate', 'its', 'add',
        '##ress', 'using', 'google', '[UNK]', '.', '[SEP]',
    ]
    mention_tokens = ['web', 'users', '[SEP]']
    expected_pairs = (
        [(text, 0) for text in sentence_tokens]
        + [(text, 1) for text in mention_tokens]
    )
    self.assertListEqual(observed_pairs, expected_pairs)

    # Only the first batch is needed to check the label tensor.
    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    label_rows = batch['label_ids'].numpy().tolist()
    self.assertEqual(
        label_rows,
        [[0, 0, 0, 0, 0, 0, 1, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0]],
    )
def write_for_official_eval(model_archive_file, test_file, output_file, label_ids_to_label):
    """Predict a label for every instance in ``test_file`` and write one per line.

    Parameters
    ----------
    model_archive_file :
        Archive passed to ``load_archive``; its config supplies the
        ``dataset_reader`` and ``vocabulary`` sections.
    test_file :
        Path handed to the dataset reader.
    output_file :
        Path of the text file to write (opened in ``'w'`` mode).
    label_ids_to_label :
        Indexable mapping from each value in the model's ``predictions``
        output to the label that is written.
    """
    archive = load_archive(model_archive_file)
    config = archive.config
    model = archive.model

    reader = DatasetReader.from_params(config['dataset_reader'])

    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(config['vocabulary'])
    iterator.index_with(vocab)

    # The model is moved with .cuda() and every batch goes to device 0.
    model.cuda()
    model.eval()

    predictions = []
    for batch in iterator(reader.read(test_file), num_epochs=1, shuffle=False):
        output = model(**move_to_device(batch, cuda_device=0))
        predicted_ids = output['predictions'].cpu().numpy().tolist()
        for label_id in predicted_ids:
            predictions.append(label_ids_to_label[label_id])

    with open(output_file, 'w') as fout:
        fout.writelines("{}\n".format(label) for label in predictions)
def setUp(self):
    """Create ``self.trainer`` for a small ``simple_tagger`` on the sequence-tagging fixture."""
    super().setUp()

    # The same fixture file serves as training and validation data.
    tagging_fixture = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")

    model_config = {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2,
        },
    }
    trainer_config = {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"}
    params = Params({
        "model": model_config,
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": tagging_fixture,
        "validation_data_path": tagging_fixture,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": trainer_config,
    })

    all_datasets = datasets_from_params(params)

    # The vocabulary is built from the instances of every dataset.
    every_instance = (
        instance
        for dataset in all_datasets.values()
        for instance in dataset
    )
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=every_instance,
    )

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    self.trainer = TrainerBase.from_params(
        model=model,
        serialization_dir=os.path.join(self.TEST_DIR, "test_search_learning_rate"),
        iterator=iterator,
        train_data=all_datasets["train"],
        params=params.pop("trainer"),
        validation_data=None,
        validation_iterator=None,
    )
def ensure_model_can_train_save_and_load(self, param_file: str):
    """Train from ``param_file``, archive, reload, and check both models agree.

    The check covers three things: the state dicts match, the first
    validation batch indexed with each model's vocabulary is the same, and
    the predictions on that batch are close.

    Parameters
    ----------
    param_file : ``str``
        Path to the training configuration. It is used for training and is
        read again to build the dataset reader and iterator for the
        comparison.

    Returns
    -------
    ``(model, loaded_model)``
        The freshly trained model and the one restored from the archive.
    """
    save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
    archive_file = os.path.join(save_dir, "model.tar.gz")
    model = train_model_from_file(param_file, save_dir)
    loaded_model = load_archive(archive_file).model

    # Fetch each state dict once instead of rebuilding it per key.
    model_state = model.state_dict()
    loaded_state = loaded_model.state_dict()
    state_keys = model_state.keys()
    assert state_keys == loaded_state.keys()

    # First we make sure that the parameters are the same for both models.
    for key in state_keys:
        assert_allclose(model_state[key].numpy(),
                        loaded_state[key].numpy(),
                        err_msg=key)

    # Read the configuration the model was actually trained with, i.e. the
    # `param_file` argument rather than `self.param_file`, so the data
    # comparison cannot silently use a different config.
    params = Params.from_file(param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])
    iterator = DataIterator.from_params(params['iterator'])

    # We'll check that even if we index the dataset with each model separately,
    # we still get the same result out.
    model_dataset = reader.read(params['validation_data_path'])
    model_dataset.index_instances(model.vocab)
    model_batch_arrays = next(iterator(model_dataset, shuffle=False))
    model_batch = arrays_to_variables(model_batch_arrays, for_training=False)

    loaded_dataset = reader.read(params['validation_data_path'])
    loaded_dataset.index_instances(loaded_model.vocab)
    loaded_batch_arrays = next(iterator(loaded_dataset, shuffle=False))
    loaded_batch = arrays_to_variables(loaded_batch_arrays, for_training=False)

    # The datasets themselves should be identical.
    for key, field in model_batch.items():
        if isinstance(field, dict):
            # Fields that are dicts are compared one sub-entry at a time.
            for subfield in field:
                self.assert_fields_equal(field[subfield],
                                         loaded_batch[key][subfield],
                                         tolerance=1e-6,
                                         name=key + '.' + subfield)
        else:
            self.assert_fields_equal(field, loaded_batch[key], 1e-6, key)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()
    model_predictions = model.forward(**model_batch)
    loaded_model_predictions = loaded_model.forward(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for
    # continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys
    # should be close.
    for key, prediction in model_predictions.items():
        self.assert_fields_equal(prediction,
                                 loaded_model_predictions[key],
                                 tolerance=1e-4,
                                 name=key)

    return model, loaded_model
def main(args):
    """Train the model described by ``args.config_path`` and optionally test it.

    Reads the train and validation data (plus the test data when
    ``test_data_path`` is configured), builds the vocabulary from all of
    them, trains, and, if a test set was read, reloads ``best.th`` from the
    output directory and evaluates on it.

    Parameters
    ----------
    args :
        Object with ``config_path`` (configuration file to load) and
        ``output_dir`` (serialization directory). ``vocabulary/``,
        ``config.json`` and, when a test set is evaluated,
        ``test_metrics.txt`` are written below ``output_dir``.
    """
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    # Everything after logging is set up runs under try/finally so the
    # handler is released even when reading, training or evaluation fails.
    try:
        prepare_environment(params)

        reader = DatasetReader.from_params(params["dataset_reader"])
        train_dataset = reader.read(params.pop("train_data_path", None))
        valid_dataset = reader.read(params.pop("validation_data_path", None))
        test_data_path = params.pop("test_data_path", None)

        if test_data_path:
            test_dataset = reader.read(test_data_path)
            vocab = Vocabulary.from_instances(
                train_dataset + valid_dataset + test_dataset)
        else:
            test_dataset = None
            vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

        model_params = params.pop("model", None)
        model = Model.from_params(model_params.duplicate(), vocab=vocab)

        vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))

        # Copy the config file next to the other outputs.
        config_copy_path = os.path.join(args.output_dir, "config.json")
        with open(args.config_path, "r", encoding="utf-8") as f_in, \
                open(config_copy_path, "w", encoding="utf-8") as f_out:
            f_out.write(f_in.read())

        iterator = DataIterator.from_params(params.pop("iterator", None))
        iterator.index_with(vocab)

        trainer_params = params.pop("trainer", None)
        # The trainer gets a duplicate so `cuda_device` can still be popped
        # from `trainer_params` for the evaluation below.
        trainer = Trainer.from_params(model=model,
                                      serialization_dir=args.output_dir,
                                      iterator=iterator,
                                      train_data=train_dataset,
                                      validation_data=valid_dataset,
                                      params=trainer_params.duplicate())
        trainer.train()

        # Evaluate on the test set.
        if test_dataset:
            logging.info("Evaluating on the test set")
            # Imported here, as in the original script.
            import torch
            model.load_state_dict(
                torch.load(os.path.join(args.output_dir, "best.th")))
            # NOTE(review): the evaluation device falls back to 0 when the
            # trainer section has no `cuda_device`; confirm that is intended.
            test_metrics = evaluate(model,
                                    test_dataset,
                                    iterator,
                                    cuda_device=trainer_params.pop("cuda_device", 0),
                                    batch_weight_key=None)
            logging.info(f"Metrics on the test set: {test_metrics}")
            metrics_path = os.path.join(args.output_dir, "test_metrics.txt")
            with open(metrics_path, "w", encoding="utf-8") as f_out:
                f_out.write(f"Metrics on the test set: {test_metrics}")
    finally:
        cleanup_global_logging(stdout_handler)
def main(config_file):
    """Score a model on the SQuAD validation file with ours and the official metrics.

    Parameters
    ----------
    config_file :
        Configuration path given to ``Params.from_file``. Its
        ``dataset_reader``, ``iterator`` and ``validation_data_path`` entries
        are read, and the whole config is passed to ``Model.load``.
    """
    config = Params.from_file(config_file)
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])

    # Reduce the iterator settings to `batch_size` and force the basic type.
    iterator_params = config['iterator']
    discarded_keys = [
        key for key in list(iterator_params.keys()) if key != 'batch_size'
    ]
    for key in discarded_keys:
        del iterator_params[key]
    iterator_params['type'] = 'basic'
    iterator = DataIterator.from_params(iterator_params)

    evaluation_data_path = config['validation_data_path']

    expected_version = '1.1'
    with open(evaluation_data_path) as dataset_file:
        dataset_json = json.load(dataset_file)
    found_version = dataset_json['version']
    if found_version != expected_version:
        # A version mismatch is only reported, not treated as an error.
        print('Evaluation expects v-' + expected_version +
              ', but got dataset with v-' + found_version,
              file=sys.stderr)
    official_script_dataset = dataset_json['data']

    cuda_device = 0
    squad_eval.verbosity = 1
    model = Model.load(config, cuda_device=cuda_device)

    # Load the evaluation data
    print("Reading evaluation data from %s" % evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model._vocab)

    model.eval()
    generator = iterator(dataset, num_epochs=1, shuffle=False)
    print("Predicting best spans for the evaluation data")
    best_spans = []
    for batch in tqdm.tqdm(generator):
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        span_tensor = model.forward(**tensor_batch)['best_span']
        best_spans.extend(
            span_tensor[row].data.cpu().tolist()
            for row in range(span_tensor.size(0))
        )

    result_dict = {}
    for (span_start, span_end, *_), instance in zip(best_spans, dataset.instances):
        span_tokens = instance.fields['passage'].tokens[span_start:span_end]
        # We have to do some hacks to get from our tokens back to the original
        # passage text, so that our answers get scored correctly. This could
        # be made much easier if we kept around the character offset in the
        # original text when we tokenize things.
        metadata = instance.metadata
        result_dict[metadata['id']] = fix_span_text(
            span_tokens, metadata['original_passage'])

    metrics = model.get_metrics()
    official_result = squad_eval.evaluate(official_script_dataset, result_dict)
    print("Our model's metrics:", metrics)
    print("Official result:", official_result)
def test_wic_reader_entity_markers(self):
    """Check the WiC reader output when ``entity_markers`` is switched on."""
    wordnet_generator = {
        "type": "wordnet_mention_generator",
        "entity_file": "tests/fixtures/wordnet/entities_fixture.jsonl",
    }
    wordnet_indexer = {
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {"type": "just_spaces"},
        },
        "namespace": "entity",
    }
    reader_params = Params({
        "type": "wic",
        "entity_markers": True,
        "tokenizer_and_candidate_generator": {
            "type": "bert_tokenizer_and_candidate_generator",
            "entity_candidate_generators": {"wordnet": wordnet_generator},
            "entity_indexers": {"wordnet": wordnet_indexer},
            "bert_model_type": "tests/fixtures/evaluation/wic/vocab_entity_markers.txt",
            "do_lower_case": True,
        },
    })

    reader = DatasetReader.from_params(reader_params)
    instances = reader.read(FIXTURES + '/train')

    # Only the first batch is inspected.
    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    self.assertTrue(len(batch['label_ids']) == 5)

    self.assertEqual(batch['index_a'][0].item(), 3)
    self.assertEqual(batch['index_b'][0].item(), 12)

    first_tokens = instances[0].fields['tokens'].tokens
    instance_0_text = [token.text for token in first_tokens]
    expected_instance_0_text = [
        '[CLS]', '[UNK]', '[UNK]', '[e1start]', '[UNK]', '[e1end]',
        '[UNK]', '[UNK]', '[UNK]', '.', '[SEP]',
        '[UNK]', '[e2start]', '[UNK]', '[e2end]', '[UNK]', 'over',
        '[UNK]', '.', '[SEP]',
    ]
    self.assertEqual(instance_0_text, expected_instance_0_text)

    # The positions asserted for index_a / index_b hold the start markers.
    self.assertEqual(instance_0_text[3], '[e1start]')
    self.assertEqual(instance_0_text[12], '[e2start]')
def test_reader(self):
    """Compare the first pretraining batch, tensor by tensor, with fixed expectations."""
    reader = get_reader(masked_lm_prob=0.15)
    # Seeded right after the reader is built and before reading the shard.
    np.random.seed(5)
    instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

    vocab_params = Params({
        "directory_path": "tests/fixtures/bert/vocab_dir_with_entities_for_tokenizer_and_generator"
    })
    vocab = Vocabulary.from_params(vocab_params)
    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    def check(actual, expected_rows):
        # Both sides are compared as nested Python lists.
        self.assertEqual(actual.tolist(), torch.tensor(expected_rows).tolist())

    check(
        batch['tokens']['tokens'],
        [[16, 18, 19, 20, 1, 19, 21, 13, 17, 21, 3, 4, 12, 13, 17],
         [16, 1, 13, 17, 21, 1, 1, 13, 17, 0, 0, 0, 0, 0, 0]],
    )

    wordnet_candidates = batch['candidates']['wordnet']
    check(
        wordnet_candidates['candidate_entities']['ids'],
        [[[29, 30], [31, 0], [31, 0]],
         [[0, 0], [0, 0], [0, 0]]],
    )
    check(
        wordnet_candidates['candidate_spans'],
        [[[1, 3], [2, 3], [5, 6]],
         [[-1, -1], [-1, -1], [-1, -1]]],
    )

    check(
        batch['lm_label_ids']['lm_labels'],
        [[0, 0, 0, 0, 0, 0, 20, 0, 0, 2, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
    )

    segment_ids = batch['segment_ids']
    check(
        segment_ids,
        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
    )
    self.assertTrue(segment_ids.dtype == torch.long)
def evaluate_from_args(args: argparse.Namespace):
    """Evaluate an archived model one instance at a time and write the metrics as CSV.

    Parameters
    ----------
    args : ``argparse.Namespace``
        Attributes read here: ``archive_file``, ``cuda_device``,
        ``overrides``, ``weights_file``, ``input_file``,
        ``embedding_sources_mapping``, ``extend_vocab``, ``output_file``
        (passed straight to ``csv.writer``) and ``batch_weight_key``.
    """
    # Disable some of the more verbose logging statements
    for noisy_logger in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy_logger).disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Prefer a dedicated validation reader; otherwise use the reader that is
    # shared between training and validation.
    reader_params = config.pop('validation_dataset_reader', None)
    if reader_params is None:
        reader_params = config.pop('dataset_reader')
    dataset_reader = DatasetReader.from_params(reader_params)

    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str, str]
    if args.embedding_sources_mapping:
        embedding_sources = json.loads(args.embedding_sources_mapping)
    else:
        embedding_sources = {}

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    # Same fallback rule as for the reader: validation iterator first.
    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    csv_writer = csv.writer(args.output_file)

    # The column order is fixed by the metrics of the first instance.
    metric_names = None
    for instance in instances:
        metrics = evaluate(model, [instance], iterator,
                           args.cuda_device, args.batch_weight_key)

        if metric_names is None:
            metric_names = sorted(metrics.keys())
            csv_writer.writerow(['instance_id', *metric_names])

        row = [instance.fields['metadata']['id']]
        row.extend(metrics[name] for name in metric_names)
        csv_writer.writerow(row)
def test_sample(self):
    """Call ``self.model.sample`` on the first batch built from the kglm.json settings."""
    generator_params = Params.from_file(
        "kglm/tests/fixtures/training_config/kglm.json")
    # Loaded but not used afterwards in this test.
    params = Params.from_file(self.param_file)
    dataset_file = "kglm/tests/fixtures/enhanced-wikitext-test/train.jsonl"

    # The reader comes from the generator config, not from this test's own.
    reader = DatasetReader.from_params(generator_params['dataset_reader'])
    instances = list(reader.read(dataset_file))

    iterator = DataIterator.from_params(generator_params['iterator'])
    iterator.index_with(self.model.vocab)

    # The iterator yields pairs here; only the first element is the batch.
    first_batch, _ = next(iterator(instances, shuffle=False))
    self.model.sample(**first_batch)
def setUp(self):
    """Create ``self.trainer`` (a ``Trainer``) for a small tagger on the tagging fixture."""
    super().setUp()

    # One fixture file is used for both the training and validation paths.
    data_path = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')

    embedder_config = {
        "token_embedders": {
            "tokens": {"type": "embedding", "embedding_dim": 5},
        },
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2,
    }
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": embedder_config,
            "encoder": encoder_config,
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": data_path,
        "validation_data_path": data_path,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "cuda_device": -1,
            "num_epochs": 2,
            "optimizer": "adam",
        },
    })

    all_datasets = datasets_from_params(params)

    # Instances from every dataset feed the vocabulary.
    instance_stream = (
        instance
        for dataset in all_datasets.values()
        for instance in dataset
    )
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), instance_stream)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

    # The first four arguments stay positional, as in the original call.
    self.trainer = Trainer.from_params(
        model,
        serialization_dir,
        iterator,
        train_data,
        params=trainer_params,
        validation_data=None,
        validation_iterator=None,
    )
def run_evaluation(evaluation_file, model_archive, random_candidates=False):
    """Run an archived model over ``evaluation_file`` and print its metrics.

    Parameters
    ----------
    evaluation_file :
        Path handed to the dataset reader.
    model_archive :
        Archive passed to ``load_archive``.
    random_candidates : bool, optional (default=False)
        When true, ``random_candidates`` is set to ``True`` on every entity
        candidate generator in the reader configuration.
    """
    archive = load_archive(model_archive)
    model = archive.model
    vocab = model.vocab
    params = archive.config

    # Both multitask flags are switched off before the forward passes.
    model.multitask = False
    model.multitask_kg = False
    model.cuda()
    model.eval()
    for parameter in model.parameters():
        parameter.requires_grad_(False)

    reader_params = params.pop('dataset_reader')
    if reader_params['type'] == 'multitask_reader':
        # Only the language-modeling sub-reader of a multitask setup is used.
        reader_params = reader_params['dataset_readers']['language_modeling']

    if random_candidates:
        tokenizer_config = reader_params['base_reader'][
            'tokenizer_and_candidate_generator']
        for generator_config in tokenizer_config[
                'entity_candidate_generators'].values():
            generator_config['random_candidates'] = True

    reader = DatasetReader.from_params(Params(reader_params))

    bucket_config = {
        "type": "bucket",
        "batch_size": 32,
        "sorting_keys": [["tokens", "num_tokens"]],
        "max_instances_in_memory": 2500,
    }
    iterator = DataIterator.from_params(
        Params({
            "type": "self_attn_bucket",
            "batch_size_schedule": "base-11gb-fp32",
            "iterator": bucket_config,
        }))
    iterator.index_with(vocab)
    instances = reader.read(evaluation_file)

    batches = tqdm.tqdm(iterator(instances, num_epochs=1))
    for batch_no, batch in enumerate(batches):
        # The return value is not used; metrics are read via get_metrics().
        model(**move_to_device(batch, 0))
        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
def __init__(self, model_archive, batch_size=32, masking_strategy=None,
             wordnet_entity_file=None, vocab_dir=None):
    """Set up the tokenizer/candidate generator, vocabulary and iterator.

    Parameters
    ----------
    model_archive :
        Either a directory containing ``config.json`` or an archive whose
        config is extracted after resolving it with ``cached_path``.
    batch_size : int, optional (default=32)
        Batch size of the ``basic`` iterator created here.
    masking_strategy : optional (default=None)
        Must be ``None`` or ``'full_mask'`` (checked with an ``assert``).
    wordnet_entity_file : optional (default=None)
        If given, replaces the ``entity_file`` of the ``wordnet`` candidate
        generator.
    vocab_dir : optional (default=None)
        If given, the vocabulary is loaded from this directory instead of the
        config's ``vocabulary`` section.
    """
    # Locate the configuration: a plain directory or a packed archive.
    if not os.path.isdir(model_archive):
        config = _extract_config_from_archive(cached_path(model_archive))
    else:
        config = Params.from_file(
            os.path.join(model_archive, 'config.json'))

    # Find the tokenizer/candidate-generator section inside the reader config.
    reader_config = config['dataset_reader'].as_dict()
    candidate_generator_params = _find_key(
        reader_config, 'tokenizer_and_candidate_generator')

    if wordnet_entity_file is not None:
        wordnet_generator = candidate_generator_params[
            'entity_candidate_generators']['wordnet']
        wordnet_generator['entity_file'] = wordnet_entity_file

    tokenizer = TokenizerAndCandidateGenerator.from_params(
        Params(candidate_generator_params))
    self.tokenizer_and_candidate_generator = tokenizer
    self.tokenizer_and_candidate_generator.whitespace_tokenize = False

    assert masking_strategy is None or masking_strategy == 'full_mask'
    self.masking_strategy = masking_strategy

    # An explicit vocabulary directory wins over the archived configuration.
    if vocab_dir is None:
        vocab_params = config['vocabulary']
    else:
        vocab_params = Params({"directory_path": vocab_dir})
    self.vocab = Vocabulary.from_params(vocab_params)

    iterator_params = Params({"type": "basic", "batch_size": batch_size})
    self.iterator = DataIterator.from_params(iterator_params)
    self.iterator.index_with(self.vocab)
def test_wic_reader(self):
    """Check batch size and target-word indices produced by the WiC reader."""
    candidate_generators = {
        "wordnet": {
            "type": "wordnet_mention_generator",
            "entity_file": "tests/fixtures/wordnet/entities_fixture.jsonl",
        },
    }
    entity_indexers = {
        "wordnet": {
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {"type": "just_spaces"},
            },
            "namespace": "entity",
        },
    }
    reader_params = Params({
        "type": "wic",
        "tokenizer_and_candidate_generator": {
            "type": "bert_tokenizer_and_candidate_generator",
            "entity_candidate_generators": candidate_generators,
            "entity_indexers": entity_indexers,
            "bert_model_type": "tests/fixtures/bert/vocab.txt",
            "do_lower_case": True,
        },
    })

    reader = DatasetReader.from_params(reader_params)
    instances = reader.read(FIXTURES + '/train')

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())

    # Stop after the first batch; it is the only one checked.
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    self.assertTrue(len(batch['label_ids']) == 5)

    first_index_a = batch['index_a'][0].item()
    first_index_b = batch['index_b'][0].item()
    self.assertEqual(first_index_a, 3)
    self.assertEqual(first_index_b, 10)
def get_wsd_reader(is_training, use_bert_indexer=False, wordnet_entity_file=None):
    """Build the WSD test reader together with its vocabulary and iterator.

    Parameters
    ----------
    is_training :
        Forwarded to the reader as ``is_training``.
    use_bert_indexer : bool, optional (default=False)
        Use the indexer parameters from ``get_bert_test_fixture()`` instead
        of a lowercasing ``single_id`` indexer.
    wordnet_entity_file : optional (default=None)
        Entity file for the reader; falls back to the cat/hat fixture.

    Returns
    -------
    ``(reader, vocab, iterator)``
        The iterator is already indexed with ``vocab``.
    """
    if wordnet_entity_file is None:
        wordnet_entity_file = "tests/fixtures/wordnet/entities_cat_hat.jsonl"

    if not use_bert_indexer:
        indexer_params = {"type": "single_id", "lowercase_tokens": True}
    else:
        indexer_params = get_bert_test_fixture()["indexer_params"]

    entity_indexer = {
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {"type": "just_spaces"},
        },
        "namespace": "entity",
    }
    reader = DatasetReader.from_params(Params({
        "type": "wordnet_fine_grained",
        "wordnet_entity_file": wordnet_entity_file,
        "token_indexers": {"tokens": indexer_params},
        "entity_indexer": entity_indexer,
        "is_training": is_training,
        "use_surface_form": False,
    }))

    vocab = Vocabulary.from_params(Params({
        "directory_path": "tests/fixtures/wordnet/cat_hat_vocabdir",
    }))

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    return reader, vocab, iterator
def test_sample(self):
    """Sample twice from the model under the same torch seed."""
    generator_params = Params.from_file(
        "kglm/tests/fixtures/training_config/kglm.no-shortlist.json")
    # Loaded but not used afterwards in this test.
    params = Params.from_file(self.param_file)
    dataset_file = "kglm/tests/fixtures/enhanced-wikitext-test/train.jsonl"

    # The instances must come from a reader in 'generative' mode.
    reader_params = generator_params['dataset_reader']
    reader_params['mode'] = 'generative'
    reader = DatasetReader.from_params(reader_params)
    instances = list(reader.read(dataset_file))

    iterator = DataIterator.from_params(generator_params['iterator'])
    iterator.index_with(self.model.vocab)
    first_batch, _ = next(iterator(instances, shuffle=False))

    # Draw two samples with an identical seed and keep their 'logp' entries.
    # NOTE(review): logp1 and logp2 are never compared in this block —
    # confirm whether an equality assertion is missing.
    torch.manual_seed(123)
    logp1 = self.model.sample(**first_batch).get('logp', None)
    torch.manual_seed(123)
    logp2 = self.model.sample(**first_batch).get('logp', None)
def get_wic_batch():
    """Return the first batch read from the WiC training fixture."""
    fixtures = 'tests/fixtures/evaluation/wic'

    tokenizer_config = {
        "type": "bert_tokenizer_and_candidate_generator",
        "entity_candidate_generators": {
            "wordnet": {
                "type": "wordnet_mention_generator",
                "entity_file": "tests/fixtures/wordnet/entities_fixture.jsonl",
            },
        },
        "entity_indexers": {
            "wordnet": {
                "type": "characters_tokenizer",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {"type": "just_spaces"},
                },
                "namespace": "entity",
            },
        },
        "bert_model_type": "tests/fixtures/bert/vocab.txt",
        "do_lower_case": True,
    }
    reader_params = Params({
        "type": "wic",
        "tokenizer_and_candidate_generator": tokenizer_config,
    })

    reader = DatasetReader.from_params(reader_params)
    instances = reader.read(fixtures + '/train')

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())

    # Leave the loop as soon as the first batch has been produced.
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    return batch
def knowbert_fill2(sentences, model, batcher, vocab, mask_start=0, mask_end=0,
                   config_file=None, top=10):
    """Mask positions ``[mask_start, mask_end)`` in each sentence and print the top predictions.

    Parameters
    ----------
    sentences :
        Iterable of sentence strings.
    model :
        Called as ``model(**batch)``; its ``pretraining_heads`` is applied to
        the ``contextual_embeddings`` and ``pooled_output`` outputs.
    batcher :
        Object providing ``tokenizer_and_candidate_generator``.
    vocab :
        Indexable mapping from token id to the value that is printed.
    mask_start, mask_end : int, optional (default=0)
        Half-open range of token positions replaced by ``'[MASK]'``.
    config_file : optional (default=None)
        Passed to ``Params.from_file``; its ``vocabulary`` section indexes
        the iterator. NOTE(review): the ``None`` default is handed to
        ``Params.from_file`` unchanged — confirm callers always supply a path.
    top : int, optional (default=10)
        Number of highest-scoring tokens printed per masked position.
    """
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 32}))
    config = Params.from_file(config_file)
    iterator.index_with(Vocabulary.from_params(config['vocabulary']))

    tokenizer = batcher.tokenizer_and_candidate_generator
    last_masked = mask_end - 1

    def touches_mask(span):
        # True when either end of the span lies inside the masked range.
        return (mask_start <= span[0] <= last_masked
                or mask_start <= span[1] <= last_masked)

    instances = []
    for sent in sentences:
        token_candidates = tokenizer.tokenize_and_generate_candidates(
            sent.replace('[MASK]', ' [MASK] '))

        masked_tokens = token_candidates['tokens'].copy()
        for position in range(mask_start, mask_end):
            masked_tokens[position] = '[MASK]'
        token_candidates['tokens'] = masked_tokens

        # mask out the entity candidates
        candidates = token_candidates['candidates']
        for candidate_key in candidates.keys():
            generator_candidates = candidates[candidate_key]
            indices_to_mask = [
                index
                for index, span in enumerate(generator_candidates['candidate_spans'])
                if touches_mask(span)
            ]

            for index in indices_to_mask:
                generator_candidates['candidate_entities'][index] = ['@@MASK@@']
                generator_candidates['candidate_entity_priors'][index] = [1.0]

            if not indices_to_mask:
                # No candidate touches the mask: add one covering the range.
                generator_candidates['candidate_spans'].append([mask_start, last_masked])
                generator_candidates['candidate_entities'].append(['@@MASK@@'])
                generator_candidates['candidate_entity_priors'].append([1.0])
                generator_candidates['candidate_segment_ids'].append(0)

        fields = tokenizer.convert_tokens_candidates_to_fields(token_candidates)
        instances.append(Instance(fields))

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        token_ids = batch['tokens']['tokens']
        print(token_ids)
        model_output = model(**batch)
        print([vocab[w] for w in batch['tokens']['tokens'][0].numpy()])
        logits, _ = model.pretraining_heads(
            model_output['contextual_embeddings'],
            model_output['pooled_output'])
        log_probs = F.log_softmax(logits, dim=-1).cpu()
        # Only the first sequence of each batch is reported.
        for mask_ind in range(mask_start, mask_end):
            top_ids = torch.topk(log_probs[0, mask_ind], top, -1)[1]
            print([vocab[t.item()] for t in top_ids])
def ensure_model_can_train_save_and_load(
        self,
        param_file: str,
        tolerance: float = 1e-4,
        cuda_device: int = -1,
        gradients_to_ignore: Set[str] = None,
        overrides: str = ""):
    """
    Train a model, archive it, load it back and check that both copies agree.

    Parameters
    ----------
    param_file : ``str``
        Path to the training configuration file used to train the model for
        this test.
    tolerance : ``float``, optional (default=1e-4)
        Tolerance used when comparing the predictions of the trained model
        with those of the reloaded model (passed as ``rtol`` to
        ``numpy.testing.assert_allclose``).
    cuda_device : ``int``, optional (default=-1)
        The device to run the test on.
    gradients_to_ignore : ``Set[str]``, optional (default=None)
        Names of parameters to leave out of the gradient check, which
        verifies that gradients are computed for all model parameters. Not
        recommended unless you are `really` sure those parameters do not need
        non-zero gradients (e.g., infrequently-used parameters of beam search
        / state machine models that a small test cannot easily exercise).
    overrides : ``str``, optional (default = "")
        A JSON string used to override values in the input parameter file.
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model

    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys

    # The parameters stored under every key must match between the models.
    for key in state_keys:
        trained_values = model.state_dict()[key].cpu().numpy()
        loaded_values = loaded_model.state_dict()[key].cpu().numpy()
        assert_allclose(trained_values, loaded_values, err_msg=key)

    params = Params.from_file(param_file)

    def deep_copy_of(section):
        # from_params consumes its input, so each model gets its own copy.
        return Params(copy.deepcopy(section.as_dict()))

    reader_params = params['dataset_reader']
    reader_params2 = deep_copy_of(reader_params)
    reader = DatasetReader.from_params(reader_params)
    reader2 = DatasetReader.from_params(reader_params2)

    iterator_params = params['iterator']
    iterator_params2 = deep_copy_of(iterator_params)
    iterator = DataIterator.from_params(iterator_params)
    iterator2 = DataIterator.from_params(iterator_params2)

    def reseed():
        # Fresh seed parameters are built for every call.
        prepare_environment(Params({
            "random_seed": 5,
            "numpy_seed": 5,
            "pytorch_seed": 5,
        }))

    # Index the validation data with each model separately; the resulting
    # batches should still be the same.
    reseed()
    model_dataset = reader.read(params['validation_data_path'])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(model_dataset, shuffle=False))

    reseed()
    loaded_dataset = reader2.read(params['validation_data_path'])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

    # Non-trainable parameters must have no gradient, and trainable ones must
    # receive some.
    self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

    # The two batches should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Eval mode turns off things like dropout before predicting.
    model.eval()
    loaded_model.eval()

    # Stateful RNNs need their states reset to behave consistently after
    # loading.
    for model_ in (model, loaded_model):
        for module in model_.modules():
            if hasattr(module, 'stateful') and module.stateful:
                module.reset_states()

    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # The loaded model must produce a loss we can backpropagate through, so
    # that training could be continued.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Every output of the trained model should be close to the loaded one's.
    for key in model_predictions.keys():
        self.assert_fields_equal(model_predictions[key],
                                 loaded_model_predictions[key],
                                 name=key,
                                 tolerance=tolerance)

    return model, loaded_model
def run_evaluation(evaluation_file, model_archive_file, is_wordnet_and_wiki=False):
    """
    Run an archived model over ``evaluation_file`` and print its metrics.

    The model and the vocabulary parameters come from the archive; the dataset
    reader (``aida_wiki_linking``) and the iterator (``basic``, batch size 16)
    are configured inline here.  Metrics are printed every 100 batches and once
    more after the final batch.  Nothing is returned.

    Parameters
    ----------
    evaluation_file :
        Passed to the dataset reader's ``read``.
    model_archive_file :
        Passed to ``load_archive``.
    is_wordnet_and_wiki : ``bool``, optional (default=False)
        When true, the reader also receives a WordNet candidate generator, the
        entity indexer uses the ``entity_wiki`` namespace instead of ``entity``,
        and each batch's ``extra_candidates`` are converted to tensors and merged
        into the ``candidates`` passed to the model.
    """
    archive = load_archive(model_archive_file)
    params = archive.config
    vocab = Vocabulary.from_params(params.pop('vocabulary'))

    model = archive.model
    model.eval()

    wordnet_entity_file = "s3://allennlp/knowbert/wordnet/entities.jsonl"

    # Both modes share one reader configuration; only the entity namespace and
    # the optional WordNet generator differ.  Nested dicts are written out as
    # fresh literals (never shared) because ``Params`` consumes them in place.
    reader_config = {
        "type": "aida_wiki_linking",
        "entity_disambiguation_only": False,
        "token_indexers": {
            "tokens": {
                "type": "bert-pretrained",
                "pretrained_model": "bert-base-uncased",
                "do_lowercase": True,
                "use_starting_offsets": True,
                "max_pieces": 512,
            },
        },
        "entity_indexer": {
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {"type": "just_spaces"},
            },
            "namespace": "entity_wiki" if is_wordnet_and_wiki else "entity",
        },
        "should_remap_span_indices": True,
    }

    candidate_generator = None
    if is_wordnet_and_wiki:
        reader_config["extra_candidate_generators"] = {
            "wordnet": {
                "type": "wordnet_mention_generator",
                "entity_file": wordnet_entity_file,
            },
        }
        cg_params = Params({
            "type": "bert_tokenizer_and_candidate_generator",
            "bert_model_type": "bert-base-uncased",
            "do_lower_case": True,
            "entity_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file": wordnet_entity_file,
                },
            },
            "entity_indexers": {
                "wordnet": {
                    "type": "characters_tokenizer",
                    "namespace": "entity_wordnet",
                    "tokenizer": {
                        "type": "word",
                        "word_splitter": {"type": "just_spaces"},
                    },
                },
            },
        })
        candidate_generator = TokenizerAndCandidateGenerator.from_params(cg_params)

    # Wrap the plain dict exactly once; the previous code wrapped an existing
    # ``Params`` object inside a second ``Params``.
    reader = DatasetReader.from_params(Params(reader_config))

    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 16}))
    iterator.index_with(vocab)

    instances = reader.read(evaluation_file)

    for batch_no, batch in enumerate(iterator(instances, shuffle=False, num_epochs=1)):
        # NOTE(review): device -1 presumably means CPU -- confirm against move_to_device.
        b = move_to_device(batch, -1)

        # Regroup the flat candidate fields under a 'wiki' entry.  Note the key
        # rename: the batch field is 'candidate_entity_prior' (singular) while
        # the regrouped key is 'candidate_entity_priors'.
        b['candidates'] = {
            'wiki': {
                'candidate_entities': b.pop('candidate_entities'),
                'candidate_entity_priors': b.pop('candidate_entity_prior'),
                'candidate_segment_ids': b.pop('candidate_segment_ids'),
                'candidate_spans': b.pop('candidate_spans'),
            }
        }
        b['gold_entities'] = {'wiki': b.pop('gold_entities')}

        if is_wordnet_and_wiki:
            extra_candidates = b.pop('extra_candidates')
            seq_len = b['tokens']['tokens'].shape[1]

            extra_instances = []
            for extra in extra_candidates:
                # Every extra candidate span gets segment id 0.
                for candidate in extra.values():
                    candidate['candidate_segment_ids'] = (
                        [0] * len(candidate['candidate_spans']))
                # Placeholder tokens/segments of the batch's sequence length, so
                # the candidate generator can build fields for the candidates.
                placeholder = {
                    'tokens': ['[CLS]'] * seq_len,
                    'segment_ids': [0] * seq_len,
                    'candidates': extra,
                }
                fields = candidate_generator.convert_tokens_candidates_to_fields(
                    placeholder)
                extra_instances.append(Instance(fields))

            extra_batch = Batch(extra_instances)
            extra_batch.index_instances(vocab)
            padding_lengths = extra_batch.get_padding_lengths()
            tensor_dict = extra_batch.as_tensor_dict(padding_lengths)
            b['candidates'].update(tensor_dict['candidates'])
            bb = move_to_device(b, -1)
        else:
            bb = b

        # The forward pass is run for the metrics printed below; its return
        # value is not needed here.
        model(**bb)

        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
def ensure_model_can_train_save_and_load(self,
                                         param_file: str,
                                         tolerance: float = 1e-4,
                                         cuda_device: int = -1):
    """
    Train a model from ``param_file``, reload it from its archive, and assert that
    the trained and reloaded models agree.

    Parameters
    ----------
    param_file : ``str``
        Training configuration used both to train the model and to rebuild the
        dataset reader and iterators used for the comparison.
    tolerance : ``float``, optional (default=1e-4)
        Tolerance passed to ``self.assert_fields_equal`` when comparing the two
        models' predictions.
    cuda_device : ``int``, optional (default=-1)
        Passed to ``load_archive`` and to both iterators.

    Returns
    -------
    The tuple ``(model, loaded_model)``.
    """
    save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
    archive_file = os.path.join(save_dir, "model.tar.gz")
    model = train_model_from_file(param_file, save_dir)
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model

    # Fetch each state dict once instead of rebuilding it for every key.
    model_state = model.state_dict()
    loaded_state = loaded_model.state_dict()
    state_keys = model_state.keys()
    loaded_state_keys = loaded_state.keys()
    assert state_keys == loaded_state_keys
    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(model_state[key].cpu().numpy(),
                        loaded_state[key].cpu().numpy(),
                        err_msg=key)

    # Reload the same configuration that was used for training above.  This
    # previously read ``self.param_file``, which could differ from the argument.
    params = Params.from_file(param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])

    # Need to duplicate params because Iterator.from_params will consume.
    iterator_params = params['iterator']
    iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

    iterator = DataIterator.from_params(iterator_params)
    iterator2 = DataIterator.from_params(iterator_params2)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    model_dataset = reader.read(params['validation_data_path'])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

    loaded_dataset = reader.read(params['validation_data_path'])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

    # Check gradients are None for non-trainable parameters and check that
    # trainable parameters receive some gradient if they are trainable.
    self.check_model_computes_gradients_correctly(model, model_batch)

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch:
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()
    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, 'stateful') and module.stateful:
                module.reset_states()
    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions:
        self.assert_fields_equal(model_predictions[key],
                                 loaded_model_predictions[key],
                                 name=key,
                                 tolerance=tolerance)

    return model, loaded_model
def predict(archive_folder, span_file, cluster_file, output_file, cuda_device):
    """
    Decode n-ary relations with an archived model and write them to ``output_file``.

    ``span_file`` and ``cluster_file`` are first merged by
    ``combine_span_and_cluster_file``; the instances are then read from the
    hard-coded ``tmp_relation_42424242.jsonl``
    (presumably the file that merge step writes -- confirm against that helper).
    The decision threshold is read from ``metrics.json`` inside ``archive_folder``.

    If no candidate of a batch clears the threshold, the environment variable
    ``SCIREX_RELATION_DECODING`` selects a fallback:

    * ``report_single_most_likely`` -- label the top-scoring candidate positive;
    * ``report_probabilistically`` -- label the top-k candidates positive, where
      ``k`` (1..10) maximises a geometric prior on ``k`` times the product of the
      selected scores;
    * anything else -- leave all labels at 0.

    Parameters
    ----------
    archive_folder :
        Folder containing ``model.tar.gz`` and ``metrics.json``.
    span_file, cluster_file :
        Inputs forwarded to ``combine_span_and_cluster_file``.
    output_file :
        Destination path; one JSON document per line.
    cuda_device :
        Passed to ``load_archive`` and ``nn_util.move_to_device``.
    """
    combine_span_and_cluster_file(span_file, cluster_file)

    test_file = 'tmp_relation_42424242.jsonl'
    # Use a context manager so the metrics file handle is closed right away
    # (it used to be left open by ``json.load(open(...))``).
    with open(archive_folder + '/metrics.json') as metrics_file:
        relation_threshold = json.load(
            metrics_file)['best_validation__n_ary_rel_global_threshold']
    print(relation_threshold)

    import_submodules("scirex")
    logging.info("Loading Model from %s", archive_folder)
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()

    model.prediction_mode = True
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    dataset_reader.prediction_mode = True
    instances = dataset_reader.read(test_file)

    # Index every instance against the model vocabulary up front; the iterator
    # built below is not given a vocabulary of its own.
    for instance in instances:
        batch = Batch([instance])
        batch.index_instances(model.vocab)

    data_iterator = DataIterator.from_params(config["validation_iterator"])
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)
                output_res = model.decode_relations(batch)

            n_ary_relations = output_res['n_ary_relation']
            predicted_relations = n_ary_relations['candidates']
            scores = n_ary_relations['scores']
            try:
                metadata = n_ary_relations['metadata'][0]
            except (KeyError, IndexError):
                # No metadata for this batch: nothing to attribute the relations to.
                continue

            doc_id = metadata['doc_id']
            # Invert cluster_name_to_id so ids map back to cluster names.
            cluster_id_to_name = {
                cluster_id: name
                for name, cluster_id in
                metadata['document_metadata']['cluster_name_to_id'].items()
            }

            # Replace each id in a relation by its cluster name (None if unknown).
            for i, rel in enumerate(predicted_relations):
                predicted_relations[i] = tuple(
                    cluster_id_to_name.get(k) for k in rel)

            if doc_id not in documents:
                documents[doc_id] = {
                    'predicted_relations': [],
                    'doc_id': doc_id
                }

            flat_scores = list(scores.ravel())
            if not flat_scores:
                warnings.warn(f"no relation scores defined for {doc_id}")
                continue

            label = [1 if x > relation_threshold else 0 for x in flat_scores]
            if not any(label):
                decoding_mode = os.environ.get("SCIREX_RELATION_DECODING")
                if decoding_mode == "report_single_most_likely":
                    label[scores.argmax()] = 1
                elif decoding_mode == "report_probabilistically":
                    idxs_sorted_by_score = sorted(
                        range(len(label)),
                        key=lambda i: scores[i],
                        reverse=True  # highest score first
                    )
                    # assuming that >10 relationships would never happen
                    possible_decoding_idxs = [
                        idxs_sorted_by_score[:i] for i in range(1, 11)
                    ]

                    def score_decoding(candidate_idxs):
                        """
                        Likelihood of labelling exactly ``candidate_idxs`` positive.

                        Combines a geometric distribution over the number of
                        relationships per document (fit to the training data)
                        with the product of the individual relation scores.

                        :param candidate_idxs (List[int]): indices labelled positive
                        :return: likelihood of that labelling
                        """
                        score_from_n_relationships = st.geom.pmf(
                            len(candidate_idxs),
                            0.4046692607003891  # MLE from training distribution, i.e.: 1 / (1 + E[X])
                        )
                        score_from_indiv_relationships = scores[candidate_idxs]
                        return score_from_n_relationships * np.prod(
                            score_from_indiv_relationships)

                    best_decoding_idxs = max(possible_decoding_idxs,
                                             key=score_decoding)
                    for idx in best_decoding_idxs:
                        label[idx] = 1

            rounded_scores = [round(float(x), 4) for x in flat_scores]
            documents[doc_id]['predicted_relations'] += list(
                zip(predicted_relations, rounded_scores, label))

        # Deduplicate relations per document, keeping the entry with the highest
        # score (the first one seen wins ties).
        for d in documents.values():
            best_by_relation = {}
            for r, s, l in d['predicted_relations']:
                r = tuple(r)
                if r not in best_by_relation or best_by_relation[r][0] < s:
                    best_by_relation[r] = (s, l)

            d['predicted_relations'] = [
                (r, s, l) for r, (s, l) in best_by_relation.items()
            ]

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
def train_model(params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    Train a model from a parameter specification and archive the result.

    Standard output/error are teed into ``serialization_dir``, a file handler is
    attached to the root logger, and the parameters are dumped to
    ``model_params.json`` before any of them is consumed.  The vocabulary is
    built with no dataset, so the ``vocabulary`` section must contain
    ``directory_path``.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying the experiment.
    cuda_device: int
        When not ``None``, written into the trainer parameters as ``cuda_device``.
    serialization_dir: str, required
        The directory in which to save results and logs.
    filtering: str
        Not referenced by this function body.

    Returns
    -------
    The trained model.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    stdout_path = os.path.join(serialization_dir, "stdout.log")
    stderr_path = os.path.join(serialization_dir, "stderr.log")
    try:
        # First try the three-argument TeeLogger signature ...
        sys.stdout = TeeLogger(stdout_path, sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(stderr_path, sys.stderr, True)  # type: ignore
    except TypeError:
        # ... and fall back to the two-argument one if that is rejected.
        sys.stdout = TeeLogger(stdout_path, sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(stderr_path, sys.stderr)  # type: ignore

    log_formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(name)s - %(message)s')
    file_handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(log_formatter)
    logging.getLogger().addHandler(file_handler)

    # Snapshot the configuration before the pops below consume it.
    params_snapshot = deepcopy(params).as_dict(quiet=True)
    snapshot_path = os.path.join(serialization_dir, "model_params.json")
    with open(snapshot_path, "w") as snapshot_file:
        json.dump(params_snapshot, snapshot_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    reader_params = params.pop('dataset_reader', {})
    read_settings = reader_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    replace_with_gold = read_settings.pop('replace_gold', False)
    pad_with_nearest = read_settings.pop('pad_with_nearest', 0)
    train_data = dataset_reader.read(train_data_path,
                                     include_metadata=True,
                                     replace_with_gold=replace_with_gold,
                                     pad_with_nearest=pad_with_nearest)

    validation_data = None
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path,
                                              include_metadata=True)

    vocab_params = params.pop("vocabulary", {})
    print(dict(vocab_params), 'directory_path' not in vocab_params)
    # No dataset is supplied to the vocabulary, so it has to come from disk.
    assert ('directory_path' in vocab_params)
    vocab = Vocabulary.from_params(vocab_params, None)
    print(vocab)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def forward_on_instances(
        model,
        instances: Iterable[Instance],
        data_iterator: DataIterator) -> List[Dict[str, numpy.ndarray]]:
    """
    Run ``model`` over ``instances`` in the batches produced by ``data_iterator``
    and return one output dict per instance.

    Each batch is indexed with the model's vocabulary, converted to tensors,
    moved to the model's prediction device and passed through ``model(...)``
    followed by ``model.decode``.  Tensor outputs are converted to numpy arrays
    and every output is split along its first dimension.  An output whose first
    dimension does not equal the batch size cannot be split: the model is asked
    to warn about it and it is left out of the per-instance dicts.

    Parameters
    ----------
    model : AllenNLP model, required
        The model to run.
    instances : List[Instance], required
        The instances to run the model on.
    data_iterator: DataIterator, required
        The DataIterator used for going over the data (e.g. BucketIterator)

    Returns
    -------
    A list of the models output for each instance.
    """
    data_iterator.index_with(model.vocab)
    results: List[Dict[str, numpy.ndarray]] = []

    with torch.no_grad():
        device = model._get_prediction_device()
        for batch in data_iterator._create_batches(instances, shuffle=False):
            expected_size = len(batch.instances)
            batch.index_instances(model.vocab)
            tensors = util.move_to_device(batch.as_tensor_dict(), device)
            outputs = model.decode(model(**tensors))

            per_instance: List[Dict[str, numpy.ndarray]] = [
                {} for _ in batch.instances
            ]
            for name, output in list(outputs.items()):
                is_tensor = isinstance(output, torch.Tensor)
                if is_tensor and output.dim() == 0:
                    # 0-dim tensors are not iterable; give them a leading
                    # dimension so that e.g. the loss is still reported when
                    # the batch size is 1.
                    output = output.unsqueeze(0)

                leading_size = output.size(0) if is_tensor else len(output)
                if leading_size != expected_size:
                    model._maybe_warn_for_unseparable_batches(name)
                    continue

                if is_tensor:
                    output = output.detach().cpu().numpy()
                for slot, element in zip(per_instance, output):
                    slot[name] = element

            results.extend(per_instance)

    return results
def ensure_model_can_train_save_and_load(self,
                                         param_file: str,
                                         tolerance: float = 1e-4,
                                         cuda_device: int = -1,
                                         gradients_to_ignore: Set[str] = None,
                                         overrides: str = ""):
    """
    Train a model, reload it from its archive, and assert both copies agree on
    parameters, on the indexed validation batch and on predictions.

    Parameters
    ----------
    param_file : ``str``
        Path to the training configuration file used to train the model for this test.
    tolerance : ``float``, optional (default=1e-4)
        Tolerance handed to ``self.assert_fields_equal`` when the predictions of the
        trained model and of the reloaded model are compared.
    cuda_device : ``int``, optional (default=-1)
        The device passed to ``load_archive`` for the reloaded model.
    gradients_to_ignore : ``Set[str]``, optional (default=None)
        Forwarded to ``self.check_model_computes_gradients_correctly``; names of
        parameters to leave out of the gradient check.
    overrides : ``str``, optional (default = "")
        A JSON string forwarded to ``train_model_from_file`` to override values in
        the parameter file.

    Returns
    -------
    The tuple ``(model, loaded_model)``.
    """
    output_dir = self.TEST_DIR / "save_and_load_test"
    archive_path = output_dir / "model.tar.gz"
    model = train_model_from_file(param_file, output_dir, overrides=overrides)
    loaded_model = load_archive(archive_path, cuda_device=cuda_device).model

    # Step 1: both models expose the same parameters with the same values.
    trained_keys = model.state_dict().keys()
    reloaded_keys = loaded_model.state_dict().keys()
    assert trained_keys == reloaded_keys
    for name in trained_keys:
        trained_value = model.state_dict()[name].cpu().numpy()
        reloaded_value = loaded_model.state_dict()[name].cpu().numpy()
        assert_allclose(trained_value, reloaded_value, err_msg=name)

    # Step 2: index the validation data once per model vocabulary.
    config = Params.from_file(param_file)
    reader = DatasetReader.from_params(config['dataset_reader'])

    # from_params consumes its argument, so the second iterator gets a deep copy.
    first_iterator_params = config['iterator']
    second_iterator_params = Params(copy.deepcopy(first_iterator_params.as_dict()))
    iterator = DataIterator.from_params(first_iterator_params)
    iterator2 = DataIterator.from_params(second_iterator_params)

    trained_dataset = reader.read(config['validation_data_path'])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(trained_dataset, shuffle=False))

    reloaded_dataset = reader.read(config['validation_data_path'])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(reloaded_dataset, shuffle=False))

    # Step 3: gradient check on the trained model (done before switching to eval mode).
    self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

    # Step 4: the two batches must match field by field.
    assert model_batch.keys() == loaded_batch.keys()
    for field_name, field_value in model_batch.items():
        self.assert_fields_equal(field_value, loaded_batch[field_name], field_name, 1e-6)

    # Step 5: compare predictions in eval mode (turns off things like dropout).
    model.eval()
    loaded_model.eval()
    # Stateful RNNs must have their states reset to behave consistently after loading.
    for network in (model, loaded_model):
        for module in network.modules():
            if getattr(module, 'stateful', False):
                module.reset_states()
    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # The reloaded model must yield a loss we can backpropagate, so training can continue.
    reloaded_loss = loaded_model_predictions["loss"]
    assert reloaded_loss is not None
    reloaded_loss.backward()

    # Every output of the trained model must be matched by the reloaded model.
    for output_name, trained_output in model_predictions.items():
        self.assert_fields_equal(trained_output,
                                 loaded_model_predictions[output_name],
                                 name=output_name,
                                 tolerance=tolerance)

    return model, loaded_model
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        Increase the learning rate linearly; if False, increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If ``serialization_dir`` exists and is not empty (and ``force`` is false), or
        if ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    # ``cuda_device`` may be a single device or a list of devices; check each one.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    # Defaults to every loaded dataset (iterating the dict yields its keys).
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    # Filter on the dataset key before descending into the dataset, so datasets
    # that are excluded from vocabulary creation are never iterated.
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance
         for key, dataset in all_datasets.items()
         if key in datasets_for_vocab_creation
         for instance in dataset)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info('Starting learning rate search from %s to %s in %s iterations.',
                start_lr, end_lr, num_batches)
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Search for a learning rate over `num_batches` mini-batches and save a plot of the
    smoothed losses as ``lr-losses.png`` in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr : ``float``
        Learning rate to start the search.
    end_lr : ``float``
        Learning rate up to which the search is done.
    num_batches : ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps : ``bool``
        Increase the learning rate linearly; if False, increase it exponentially.
    stopping_factor : ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force : ``bool``
        Forwarded to ``create_serialization_dir``. If True and the serialization
        directory already exists, everything in it will be removed prior to finding
        the learning rate.
    """
    create_serialization_dir(params, serialization_dir, recover=False, force=force)
    prepare_environment(params)

    raw_config = params.params
    check_for_gpu(raw_config.get("trainer").get("cuda_device", -1))

    # See https://github.com/allenai/allennlp/issues/3658
    distributed_params = raw_config.get("distributed", None)
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    # Defaults to every loaded dataset (iterating the dict yields its keys).
    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))
    for dataset_name in vocab_dataset_names:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(vocab_dataset_names),
    )

    vocab_params = params.pop("vocabulary", {})
    vocab_instances = (
        instance
        for key, dataset in all_datasets.items()
        for instance in dataset
        if key in vocab_dataset_names
    )
    vocab = Vocabulary.from_params(vocab_params, instances=vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets["train"]

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    frozen_patterns = trainer_params.pop("no_grad", ())
    for parameter_name, parameter in model.named_parameters():
        if any(re.search(pattern, parameter_name) for pattern in frozen_patterns):
            parameter.requires_grad_(False)

    if trainer_params.pop("type", "default") != "default":
        raise ConfigurationError(
            "currently find-learning-rate only works with the default Trainer")

    trainer: Trainer = TrainerBase.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=None,
        params=trainer_params,
        validation_iterator=None,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, raw_losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info(f"Finished learning rate search.")

    plot_path = os.path.join(serialization_dir, "lr-losses.png")
    _save_plot(learning_rates, _smooth(raw_losses, 0.98), plot_path)
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Search for a learning rate over `num_batches` mini-batches and save a plot of the
    smoothed losses as ``lr-losses.png`` in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        Increase the learning rate linearly; if False, increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    # With ``force`` an existing directory is wiped first; the emptiness check
    # below then sees no directory and simply recreates it.
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    directory_in_use = os.path.exists(serialization_dir) and os.listdir(serialization_dir)
    if directory_in_use:
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    # Defaults to every loaded dataset (iterating the dict yields its keys).
    selected_names = set(params.pop("datasets_for_vocab_creation", all_datasets))
    for selected in selected_names:
        if selected not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {selected}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(selected_names))

    vocab_params = params.pop("vocabulary", {})
    instances_for_vocab = (instance
                           for key, dataset in all_datasets.items()
                           for instance in dataset
                           if key in selected_names)
    vocab = Vocabulary.from_params(vocab_params, instances_for_vocab)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    frozen_regexes = trainer_params.pop("no_grad", ())
    for parameter_name, parameter in model.named_parameters():
        if any(re.search(regex, parameter_name) for regex in frozen_regexes):
            parameter.requires_grad_(False)

    if trainer_params.pop("type", "default") != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")

    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')

    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, _smooth(raw_losses, 0.98), plot_path)
def ensure_model_can_train_save_and_load(self,
                                         param_file: str,
                                         tolerance: float = 1e-4,
                                         cuda_device: int = -1):
    """
    Train a model, reload it from its archive, and assert both copies agree on
    parameters, on the indexed validation batch and on predictions.

    Parameters
    ----------
    param_file : ``str``
        Path to the training configuration file; also used to rebuild the dataset
        reader and the iterators for the comparison.
    tolerance : ``float``, optional (default=1e-4)
        Tolerance handed to ``self.assert_fields_equal`` when predictions are compared.
    cuda_device : ``int``, optional (default=-1)
        Passed to ``load_archive`` and to both iterators.

    Returns
    -------
    The tuple ``(model, loaded_model)``.
    """
    output_dir = self.TEST_DIR / "save_and_load_test"
    archive_path = output_dir / "model.tar.gz"
    model = train_model_from_file(param_file, output_dir)
    loaded_model = load_archive(archive_path, cuda_device=cuda_device).model

    # Step 1: both models expose the same parameters with the same values.
    trained_keys = model.state_dict().keys()
    reloaded_keys = loaded_model.state_dict().keys()
    assert trained_keys == reloaded_keys
    for name in trained_keys:
        trained_value = model.state_dict()[name].cpu().numpy()
        reloaded_value = loaded_model.state_dict()[name].cpu().numpy()
        assert_allclose(trained_value, reloaded_value, err_msg=name)

    # Step 2: index the validation data once per model vocabulary.
    config = Params.from_file(param_file)
    reader = DatasetReader.from_params(config['dataset_reader'])

    # from_params consumes its argument, so the second iterator gets a deep copy.
    first_iterator_params = config['iterator']
    second_iterator_params = Params(copy.deepcopy(first_iterator_params.as_dict()))
    iterator = DataIterator.from_params(first_iterator_params)
    iterator2 = DataIterator.from_params(second_iterator_params)

    trained_dataset = reader.read(config['validation_data_path'])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(trained_dataset, shuffle=False, cuda_device=cuda_device))

    reloaded_dataset = reader.read(config['validation_data_path'])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(reloaded_dataset, shuffle=False, cuda_device=cuda_device))

    # Step 3: gradient check on the trained model (done before switching to eval mode).
    self.check_model_computes_gradients_correctly(model, model_batch)

    # Step 4: the two batches must match field by field.
    assert model_batch.keys() == loaded_batch.keys()
    for field_name, field_value in model_batch.items():
        self.assert_fields_equal(field_value, loaded_batch[field_name], field_name, 1e-6)

    # Step 5: compare predictions in eval mode (turns off things like dropout).
    model.eval()
    loaded_model.eval()
    # Stateful RNNs must have their states reset to behave consistently after loading.
    for network in (model, loaded_model):
        for module in network.modules():
            if getattr(module, 'stateful', False):
                module.reset_states()
    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # The reloaded model must yield a loss we can backpropagate, so training can continue.
    reloaded_loss = loaded_model_predictions["loss"]
    assert reloaded_loss is not None
    reloaded_loss.backward()

    # Every output of the trained model must be matched by the reloaded model.
    for output_name, trained_output in model_predictions.items():
        self.assert_fields_equal(trained_output,
                                 loaded_model_predictions[output_name],
                                 name=output_name,
                                 tolerance=tolerance)

    return model, loaded_model
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    Train a model on FEVER data from a parameter specification and archive the
    result in ``serialization_dir``.

    Side effects visible in this function: random seeds are set through
    ``SimpleRandom.set_seeds()``; ``sys.stdout`` and ``sys.stderr`` are replaced
    by ``TeeLogger`` wrappers that also write to ``stdout.log`` / ``stderr.log``;
    a file handler writing ``python_logging.log`` is attached to the root logger;
    the parameters, the vocabulary and the model archive are written below
    ``serialization_dir``.

    If you care about reproducibility, avoid running PyTorch or numpy code that
    consumes random numbers before calling this function.

    Parameters
    ----------
    db : FeverDocDB, required
        Document database handed to the ``FEVERReader``.
    params : Params, required
        The experiment specification. Its sections are popped as they are used.
    cuda_device : int, required
        When not ``None``, written into the trainer parameters as ``cuda_device``.
    serialization_dir : str, required
        The directory in which to save results and logs.
    filtering : str, required
        Passed through to the ``FEVERReader`` as ``filtering``.

    Returns
    -------
    The trained ``Model``.
    """
    SimpleRandom.set_seeds()
    os.makedirs(serialization_dir, exist_ok=True)

    def _output_path(name: str) -> str:
        # Everything this function writes lives directly in serialization_dir.
        return os.path.join(serialization_dir, name)

    sys.stdout = TeeLogger(_output_path("stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(_output_path("stderr.log"), sys.stderr)  # type: ignore

    file_handler = logging.FileHandler(_output_path("python_logging.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(file_handler)

    # Snapshot the configuration before any section is popped from it.
    params_snapshot = deepcopy(params).as_dict(quiet=True)
    with open(_output_path("model_params.json"), "w") as param_file:
        json.dump(params_snapshot, param_file, indent=4)

    # Assemble the pieces the Trainer needs, starting with the dataset reader.
    reader_params = params.pop('dataset_reader', {})
    sentence_level = reader_params.pop("sentence_level", False)
    wiki_tokenizer = Tokenizer.from_params(
        reader_params.pop('wiki_tokenizer', {}))
    claim_tokenizer = Tokenizer.from_params(
        reader_params.pop('claim_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(
        reader_params.pop('token_indexers', {}))
    dataset_reader = FEVERReader(db,
                                 sentence_level=sentence_level,
                                 wiki_tokenizer=wiki_tokenizer,
                                 claim_tokenizer=claim_tokenizer,
                                 token_indexers=token_indexers,
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    # (name, dataset) pairs for every dataset that feeds the vocabulary.
    vocab_sources = [("train", train_data)]

    validation_data = None
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        vocab_sources.append(("validation", validation_data))

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(name for name, _ in vocab_sources))
    vocab_params = params.pop("vocabulary", {})
    vocab_instances = []
    for _, dataset in vocab_sources:
        vocab_instances.extend(dataset.instances)
    vocab = Vocabulary.from_params(vocab_params, Dataset(vocab_instances))
    vocab.save_to_files(_output_path("vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)
    trainer.train()

    # Bundle the results into an archive.
    archive_model(serialization_dir)
    return model
def predict(archive_folder, span_prediction_file, output_file, cuda_device):
    '''
    Score span pairs with an archived model and write the scores per document.

    Loads ``model.tar.gz`` from ``archive_folder``, reads
    ``span_prediction_file`` with a ``ScirexCoreferenceEvalReader``
    (``field="ner"``), runs the model over the instances in batches of 1000 and
    writes one JSON object per document to ``output_file``.

    span_prediction_file (jsonl) needs at least three fields -
        doc_id, words: List[str], field: List[Tuple[start_index, end_index, type]]

    output_file (jsonl) gets one line per document -
        {
            'doc_id' : str,
            'pairwise_coreference_scores' :
                List[(s_1, e_1), (s_2, e_2), float rounded to 4 decimal places]
        }

    Nothing is returned; the result is the written file.

    Raises an AssertionError when the instances of one batch do not all carry
    the same ``field`` in their metadata.
    '''
    import_submodules("scirex")

    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()

    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    # The reader class is chosen explicitly below, so drop the registered type.
    dataset_reader_params.pop('type')
    dataset_reader = ScirexCoreferenceEvalReader.from_params(
        params=dataset_reader_params, field="ner")
    instances = dataset_reader.read(span_prediction_file)

    # Index all instances against the model vocabulary once, up front.
    Batch(instances).index_instances(model.vocab)

    # The archived batch size is replaced by a fixed one; tolerate configs that
    # never set it instead of failing on the missing key.
    config['iterator'].pop('batch_size', None)
    data_iterator = DataIterator.from_params(config["iterator"],
                                             batch_size=1000)
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)  # Put on GPU.
                pred = model(**batch)
                decoded = model.decode(pred)

            metadata = decoded["metadata"]
            label_prob = [float(x) for x in decoded["label_probs"]]
            doc_ids = [m["doc_id"] for m in metadata]
            span_premise = [m["span_premise"] for m in metadata]
            span_hypothesis = [m["span_hypothesis"] for m in metadata]
            fields = [m["field"] for m in metadata]

            # Previously the assertion message was a `breakpoint()` call, which
            # opened a debugger on failure; report the offending values instead.
            assert len(set(fields)) == 1, (
                f"expected a single field per batch, got {set(fields)!r}")

            for doc_id, span_p, span_h, p in zip(doc_ids, span_premise,
                                                 span_hypothesis, label_prob):
                document = documents.setdefault(
                    doc_id, {
                        "doc_id": doc_id,
                        "pairwise_coreference_scores": []
                    })
                document["pairwise_coreference_scores"].append(
                    ((span_p[0], span_p[1]), (span_h[0], span_h[1]),
                     round(p, 4)))

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
def from_partial_objects(
    cls,
    serialization_dir: str,
    local_rank: int,
    batch_weight_key: str,
    dataset_reader: DatasetReader,
    train_data_path: str,
    model: Lazy[Model],
    iterator: DataIterator,
    trainer: Lazy[TrainerBase],
    vocabulary: Lazy[Vocabulary] = None,
    datasets_for_vocab_creation: List[str] = None,
    validation_dataset_reader: DatasetReader = None,
    validation_data_path: str = None,
    validation_iterator: DataIterator = None,
    test_data_path: str = None,
    evaluate_on_test: bool = False,
) -> "TrainModel":
    """
    This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
    object from a config file passed to the `allennlp train` command.  The arguments to this
    method are the allowed top-level keys in a configuration file (except for the first three,
    which are obtained separately).

    You *could* use this outside of our `FromParams` logic if you really want to, but there
    might be easier ways to accomplish your goal than instantiating `Lazy` objects.  If you are
    writing your own training loop, we recommend that you look at the implementation of this
    method for inspiration and possibly some utility functions you can call, but you very likely
    should not use this method directly.

    The `Lazy` type annotations here are a mechanism for building dependencies to an object
    sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
    needs to see the data before it's constructed (to create a vocabulary) and the trainer needs
    the data and the model before it's constructed.  Objects that have sequential dependencies
    like this are labeled as `Lazy` in their type annotations, and we pass the missing
    dependencies when we call their `construct()` method, which you can see in the code below.

    # Parameters

    serialization_dir: `str`
        The directory where logs and model archives will be saved.
    local_rank: `int`
        The process index that is initialized using the GPU device id.
    batch_weight_key: `str`
        The name of metric used to weight the loss on a per-batch basis.
    dataset_reader: `DatasetReader`
        The `DatasetReader` that will be used for training and (by default) for validation.
    train_data_path: `str`
        The file (or directory) that will be passed to `dataset_reader.read()` to construct the
        training data.
    model: `Lazy[Model]`
        The model that we will train.  This is lazy because it depends on the `Vocabulary`;
        after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.
    iterator: `DataIterator`
        The iterator we use to batch instances from the dataset reader at training and (by
        default) validation time.
    trainer: `Lazy[TrainerBase]`
        The `Trainer` that actually implements the training loop.  This is a lazy object because
        it depends on the model that's going to be trained.
    vocabulary: `Lazy[Vocabulary]`, optional (default=None)
        The `Vocabulary` that we will use to convert strings in the data to integer ids (and
        possibly set sizes of embedding matrices in the `Model`).  By default (`None`, or a
        `Lazy` that constructs nothing) we construct the vocabulary from the instances that we
        read.
    datasets_for_vocab_creation: `List[str]`, optional (default=None)
        If you pass in more than one dataset but don't want to use all of them to construct a
        vocabulary, you can pass in this key to limit it.  Valid entries in the list are
        "train", "validation" and "test".
    validation_dataset_reader: `DatasetReader`, optional (default=None)
        If given, we will use this dataset reader for the validation data instead of
        `dataset_reader`.
    validation_data_path: `str`, optional (default=None)
        If given, we will use this data for computing validation metrics and early stopping.
    validation_iterator: `DataIterator`, optional (default=None)
        If given, we will use this iterator for batching and scheduling instances for the
        validation data, instead of `iterator`.
    test_data_path: `str`, optional (default=None)
        If given, we will use this as test data.  This makes it available for vocab creation by
        default, but nothing else.
    evaluate_on_test: `bool`, optional (default=False)
        If given, we will evaluate the final model on this data at the end of training.  Note
        that we do not recommend using this for actual test data in every-day experimentation;
        you should only very rarely evaluate your model on actual test data.

    # Raises

    ConfigurationError
        If `datasets_for_vocab_creation` names a dataset that was not read.
    """
    datasets = training_util.read_all_datasets(
        train_data_path=train_data_path,
        dataset_reader=dataset_reader,
        validation_dataset_reader=validation_dataset_reader,
        validation_data_path=validation_data_path,
        test_data_path=test_data_path,
    )

    if datasets_for_vocab_creation:
        for key in datasets_for_vocab_creation:
            if key not in datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {key}")

    instance_generator = (
        instance
        for key, dataset in datasets.items()
        if not datasets_for_vocab_creation or key in datasets_for_vocab_creation
        for instance in dataset
    )

    # `vocabulary` defaults to None, so calling `.construct` on it unconditionally
    # would fail with an AttributeError when this method is used directly rather
    # than through `FromParams`.
    vocabulary_ = None
    if vocabulary is not None:
        vocabulary_ = vocabulary.construct(instances=instance_generator)
    if not vocabulary_:
        vocabulary_ = Vocabulary.from_instances(instance_generator)

    model_ = model.construct(vocab=vocabulary_)

    # Initializing the model can have side effect of expanding the vocabulary.
    # Save the vocab only in the master. In the degenerate non-distributed
    # case, we're trivially the master.
    if common_util.is_master():
        vocabulary_path = os.path.join(serialization_dir, "vocabulary")
        vocabulary_.save_to_files(vocabulary_path)

    iterator.index_with(model_.vocab)
    validation_iterator = validation_iterator or iterator
    validation_iterator.index_with(model_.vocab)  # it is ok to call this twice

    # We don't need to pass serialization_dir and local_rank here, because they will have been
    # passed through the trainer by from_params already, because they were keyword arguments to
    # construct this class in the first place.
    trainer_ = trainer.construct(
        model=model_,
        iterator=iterator,
        train_data=datasets["train"],
        validation_iterator=validation_iterator,
        validation_data=datasets.get("validation"),
    )

    return cls(
        serialization_dir=serialization_dir,
        model=model_,
        trainer=trainer_,
        evaluation_dataset=datasets.get("test"),
        evaluation_iterator=validation_iterator,
        evaluate_on_test=evaluate_on_test,
        batch_weight_key=batch_weight_key,
    )
def predict(archive_folder, span_file, cluster_file, output_file, cuda_device):
    """
    Decode n-ary relations with an archived model and write them per document.

    Steps, as performed below:

    1. ``combine_span_and_cluster_file(span_file, cluster_file)`` is called; the
       instances are then read from ``tmp_relation_42424242.jsonl``
       (presumably the file that call produces - confirm against the helper).
    2. The decision threshold is read from ``metrics.json`` in
       ``archive_folder`` (key ``test__n_ary_rel_global_threshold``) and printed.
    3. The model from ``model.tar.gz`` is put in eval / prediction mode and run
       over the instances with the archived ``validation_iterator``.
    4. For every document the candidate relations are mapped through the
       inverted ``cluster_name_to_id`` metadata, paired with their score
       (rounded to 4 decimal places) and a 0/1 label (score > threshold), and
       de-duplicated keeping the highest score per relation.

    output_file (jsonl) gets one line per document -
        {'doc_id': str, 'predicted_relations': List[(relation, score, label)]}

    Nothing is returned; the result is the written file.
    """
    combine_span_and_cluster_file(span_file, cluster_file)
    test_file = 'tmp_relation_42424242.jsonl'

    # Read the threshold inside a context manager so the handle is closed
    # deterministically (the bare `open(...)` used before was never closed).
    with open(os.path.join(archive_folder, 'metrics.json')) as metrics_file:
        relation_threshold = json.load(
            metrics_file)['test__n_ary_rel_global_threshold']
    print(relation_threshold)

    import_submodules("scirex")
    logging.info("Loading Model from %s", archive_folder)
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()
    model.prediction_mode = True

    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    dataset_reader.prediction_mode = True
    instances = dataset_reader.read(test_file)

    # Index every instance on its own against the model vocabulary.
    for instance in instances:
        Batch([instance]).index_instances(model.vocab)

    data_iterator = DataIterator.from_params(config["validation_iterator"])
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)
                output_res = model.decode_relations(batch)

            n_ary_relations = output_res['n_ary_relation']
            # Skip outputs without metadata before touching any other key.
            if 'metadata' not in n_ary_relations:
                continue

            candidates = n_ary_relations['candidates']
            scores = n_ary_relations['scores']
            metadata = n_ary_relations['metadata'][0]
            doc_id = metadata['doc_id']

            # Invert cluster_name_to_id so candidate entries can be looked up
            # by id; entries without a match become None.
            coref_key_map = {
                cluster_id: cluster_name
                for cluster_name, cluster_id in
                metadata['document_metadata']['cluster_name_to_id'].items()
            }
            predicted_relations = [
                tuple(coref_key_map.get(k) for k in rel) for rel in candidates
            ]

            flat_scores = list(scores.ravel())
            labels = [1 if x > relation_threshold else 0 for x in flat_scores]
            rounded_scores = [round(float(x), 4) for x in flat_scores]

            document = documents.setdefault(doc_id, {
                'predicted_relations': [],
                'doc_id': doc_id
            })
            document['predicted_relations'].extend(
                zip(predicted_relations, rounded_scores, labels))

        # Keep only the best-scoring entry for each distinct relation.
        for document in documents.values():
            best_by_relation = {}
            for relation, score, label in document['predicted_relations']:
                relation = tuple(relation)
                if (relation not in best_by_relation
                        or best_by_relation[relation][0] < score):
                    best_by_relation[relation] = (score, label)
            document['predicted_relations'] = [
                (relation, score, label)
                for relation, (score, label) in best_by_relation.items()
            ]

        f.write("\n".join([json.dumps(x) for x in documents.values()]))