Example #1
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    loader_params = params.pop("iterator")
    train_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print("Showing the first 10 instances:")
        for inst in train_data_loader.iter_instances():
            print(inst)
            return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
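As a rough usage sketch for run_config above: the section names mirror the keys popped in the function body, while the reader type, file paths, and batch size below are placeholder assumptions.

minimal_config = json.dumps({
    "dataset_reader": {"type": "sequence_tagging"},   # assumed reader type
    "train_data_path": "data/train.tsv",              # hypothetical paths
    "validation_data_path": "data/dev.tsv",
    "iterator": {"type": "basic", "batch_size": 32},
    # Omitting "model" (and "trainer") triggers the preview branch above;
    # include both sections to actually build and train a model.
})
run_config(minimal_config)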
Example #2
    def test_ultra_fine_reader(self):
        reader = get_reader("entity")
        instances = ensure_list(
            reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 2)

        # Check that first instance's tokens are correct
        tokens_0 = [x.text for x in instances[0]['tokens']]
        segments_0 = list(instances[0]['segment_ids'].array)
        actual = list(zip(tokens_0, segments_0))
        expected = [('[CLS]', 0), ('the', 0), ('british', 0),
                    ('information', 0), ('commissioner', 0), ("'s", 0),
                    ('office', 0), ('invites', 0), ('[unused0]', 0), ('to', 0),
                    ('locate', 0), ('its', 0), ('add', 0), ('##ress', 0),
                    ('using', 0), ('google', 0), ('[UNK]', 0), ('.', 0),
                    ('[SEP]', 0), ('web', 1), ('users', 1), ('[SEP]', 1)]
        self.assertListEqual(actual, expected)

        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        expected_labels = [[0, 0, 0, 0, 0, 0, 1, 0, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0]]
        self.assertEqual(batch['label_ids'].numpy().tolist(), expected_labels)
Example #3
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])

    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)

        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]

        predictions.extend(batch_labels)


    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
Example #4
    def setUp(self):
        super().setUp()
        params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "cuda_device": -1,
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            instances=(instance for dataset in all_datasets.values()
                       for instance in dataset),
        )
        model = Model.from_params(vocab=vocab, params=params.pop("model"))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)
        train_data = all_datasets["train"]
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR,
                                         "test_search_learning_rate")

        self.trainer = TrainerBase.from_params(
            model=model,
            serialization_dir=serialization_dir,
            iterator=iterator,
            train_data=train_data,
            params=trainer_params,
            validation_data=None,
            validation_iterator=None,
        )
Example #5
    def ensure_model_can_train_save_and_load(self, param_file: str):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].numpy(),
                            loaded_model.state_dict()[key].numpy(),
                            err_msg=key)
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        iterator = DataIterator.from_params(params['iterator'])

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        model_dataset.index_instances(model.vocab)
        model_batch_arrays = next(iterator(model_dataset, shuffle=False))
        model_batch = arrays_to_variables(model_batch_arrays, for_training=False)
        loaded_dataset = reader.read(params['validation_data_path'])
        loaded_dataset.index_instances(loaded_model.vocab)
        loaded_batch_arrays = next(iterator(loaded_dataset, shuffle=False))
        loaded_batch = arrays_to_variables(loaded_batch_arrays, for_training=False)

        # The datasets themselves should be identical.
        for key in model_batch.keys():
            field = model_batch[key]
            if isinstance(field, dict):
                for subfield in field:
                    self.assert_fields_equal(model_batch[key][subfield],
                                             loaded_batch[key][subfield],
                                             tolerance=1e-6,
                                             name=key + '.' + subfield)
            else:
                self.assert_fields_equal(model_batch[key], loaded_batch[key], 1e-6, key)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        model_predictions = model.forward(**model_batch)
        loaded_model_predictions = loaded_model.forward(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     tolerance=1e-4,
                                     name=key)

        return model, loaded_model
Example #6
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset +
                                          test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))
    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # import here to ensure the experiment can be reproduced
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop(
                                    "cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
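The main() above only reads args.config_path and args.output_dir; a minimal argparse setup along those lines (the help strings and the assumption that the output directory already exists are mine) could look like:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("config_path", help="Path to the AllenNLP experiment config")
parser.add_argument("output_dir", help="Existing directory for the model, vocabulary and logs")
main(parser.parse_args())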
Example #7
def main(config_file):
    config = Params.from_file(config_file)
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    iterator_params = config['iterator']
    iterator_keys = list(iterator_params.keys())
    for key in iterator_keys:
        if key != 'batch_size':
            del iterator_params[key]
    iterator_params['type'] = 'basic'
    iterator = DataIterator.from_params(iterator_params)
    evaluation_data_path = config['validation_data_path']

    expected_version = '1.1'
    with open(evaluation_data_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        if (dataset_json['version'] != expected_version):
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        official_script_dataset = dataset_json['data']

    cuda_device = 0
    squad_eval.verbosity = 1
    model = Model.load(config, cuda_device=cuda_device)

    # Load the evaluation data
    print("Reading evaluation data from %s" % evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model._vocab)

    model.eval()
    generator = iterator(dataset, num_epochs=1, shuffle=False)
    print("Predicting best spans for the evaluation data")
    best_spans = []
    result_dict = {}
    for batch in tqdm.tqdm(generator):
        tensor_batch = arrays_to_variables(batch,
                                           cuda_device,
                                           for_training=False)
        result = model.forward(**tensor_batch)
        best_span_tensor = result['best_span']
        for i in range(best_span_tensor.size(0)):
            best_spans.append(best_span_tensor[i].data.cpu().tolist())
    for best_span, instance in zip(best_spans, dataset.instances):
        span_tokens = instance.fields['passage'].tokens[
            best_span[0]:best_span[1]]
        # We have to do some hacks to get from our tokens back to the original passage text, so
        # that our answers get scored correctly.  This could be made much easier if we kept around
        # the character offset in the original text when we tokenize things.
        span_text = fix_span_text(span_tokens,
                                  instance.metadata['original_passage'])
        question_id = instance.metadata['id']
        result_dict[question_id] = span_text
    metrics = model.get_metrics()
    official_result = squad_eval.evaluate(official_script_dataset, result_dict)
    print("Our model's metrics:", metrics)
    print("Official result:", official_result)
Example #8
    def test_wic_reader_entity_markers(self):
        reader_params = Params({
            "type": "wic",
            "entity_markers": True,
            "tokenizer_and_candidate_generator": {
                "type": "bert_tokenizer_and_candidate_generator",
                "entity_candidate_generators": {
                    "wordnet": {
                        "type":
                        "wordnet_mention_generator",
                        "entity_file":
                        "tests/fixtures/wordnet/entities_fixture.jsonl"
                    }
                },
                "entity_indexers": {
                    "wordnet": {
                        "type": "characters_tokenizer",
                        "tokenizer": {
                            "type": "word",
                            "word_splitter": {
                                "type": "just_spaces"
                            },
                        },
                        "namespace": "entity"
                    }
                },
                "bert_model_type":
                "tests/fixtures/evaluation/wic/vocab_entity_markers.txt",
                "do_lower_case": True,
            },
        })

        reader = DatasetReader.from_params(reader_params)
        instances = reader.read(FIXTURES + '/train')
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        self.assertTrue(len(batch['label_ids']) == 5)

        self.assertEqual(batch['index_a'][0].item(), 3)
        self.assertEqual(batch['index_b'][0].item(), 12)

        instance_0_text = [
            token.text for token in instances[0].fields['tokens'].tokens
        ]
        expected_instance_0_text = [
            '[CLS]', '[UNK]', '[UNK]', '[e1start]', '[UNK]', '[e1end]',
            '[UNK]', '[UNK]', '[UNK]', '.', '[SEP]', '[UNK]', '[e2start]',
            '[UNK]', '[e2end]', '[UNK]', 'over', '[UNK]', '.', '[SEP]'
        ]
        self.assertEqual(instance_0_text, expected_instance_0_text)
        self.assertEqual(instance_0_text[3], '[e1start]')
        self.assertEqual(instance_0_text[12], '[e2start]')
Example #9
    def test_reader(self):
        reader = get_reader(masked_lm_prob=0.15)

        np.random.seed(5)
        instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

        vocab = Vocabulary.from_params(Params({
            "directory_path": "tests/fixtures/bert/vocab_dir_with_entities_for_tokenizer_and_generator"
        }))
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(vocab)

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        actual_tokens_ids = batch['tokens']['tokens']
        expected_tokens_ids = torch.tensor(
                [[16, 18, 19, 20,  1, 19, 21, 13, 17, 21,  3,  4, 12, 13, 17],
                [16,  1, 13, 17, 21,  1,  1, 13, 17,  0,  0,  0,  0,  0,  0]])

        self.assertEqual(actual_tokens_ids.tolist(), expected_tokens_ids.tolist())

        actual_entities = batch['candidates']['wordnet']['candidate_entities']['ids']
        expected_entities = torch.tensor(
            [[[29, 30],
              [31,  0],
              [31,  0]],

             [[ 0,  0],
              [ 0,  0],
              [ 0,  0]]])
        self.assertEqual(actual_entities.tolist(), expected_entities.tolist())

        expected_spans = torch.tensor(
            [[[ 1,  3],
              [ 2,  3],
              [ 5,  6]],

             [[-1, -1],
              [-1, -1],
              [-1, -1]]])
        actual_spans = batch['candidates']['wordnet']['candidate_spans']
        self.assertEqual(actual_spans.tolist(), expected_spans.tolist())

        expected_lm_labels = torch.tensor(
                [[ 0,  0,  0,  0,  0,  0, 20,  0,  0,  2,  0,  0,  0,  0,  0],
                 [ 0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
        actual_lm_labels = batch['lm_label_ids']['lm_labels']
        self.assertEqual(actual_lm_labels.tolist(), expected_lm_labels.tolist())

        expected_segment_ids = torch.tensor(
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
             [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])
        self.assertEqual(batch['segment_ids'].tolist(), expected_segment_ids.tolist())
        self.assertTrue(batch['segment_ids'].dtype == torch.long)
Example #10
def evaluate_from_args(args: argparse.Namespace):
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
                                         if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    csv_writer = csv.writer(args.output_file)

    keys = None
    for instance in instances:
        metrics = evaluate(model, [instance], iterator, args.cuda_device, args.batch_weight_key)

        if keys is None:
            keys = sorted(metrics.keys())
            csv_writer.writerow(['instance_id', *keys])

        instance_id = instance.fields['metadata']['id']

        values = [metrics[key] for key in keys]
        csv_writer.writerow([instance_id, *values])
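For reference, the attributes that evaluate_from_args reads from args, sketched as a plain argparse.Namespace; every concrete value here is an assumption.

import argparse
import sys

args = argparse.Namespace(
    archive_file="model.tar.gz",      # hypothetical archive path
    weights_file=None,
    cuda_device=-1,
    overrides="",
    input_file="data/test.jsonl",     # hypothetical evaluation file
    extend_vocab=False,
    embedding_sources_mapping="",
    batch_weight_key="",
    output_file=sys.stdout,           # csv.writer needs a file-like object
)
evaluate_from_args(args)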
Example #11
    def test_sample(self):
        generator_params = Params.from_file(
            "kglm/tests/fixtures/training_config/kglm.json")
        params = Params.from_file(self.param_file)
        dataset_file = "kglm/tests/fixtures/enhanced-wikitext-test/train.jsonl"

        # Need instances from 'generative' reader!
        reader_params = generator_params['dataset_reader']
        reader = DatasetReader.from_params(reader_params)
        instances = list(reader.read(dataset_file))
        iterator = DataIterator.from_params(generator_params['iterator'])
        iterator.index_with(self.model.vocab)
        batch, _ = next(iterator(instances, shuffle=False))
        self.model.sample(**batch)
Example #12
    def setUp(self):
        super().setUp()
        params = Params({
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "token_embedders": {
                            "tokens": {
                                "type": "embedding",
                                "embedding_dim": 5
                            }
                        }
                    },
                    "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                    }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                    "cuda_device": -1,
                    "num_epochs": 2,
                    "optimizer": "adam"
                }
            })
        all_datasets = datasets_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for dataset in all_datasets.values()
             for instance in dataset)
        )
        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)
        train_data = all_datasets['train']
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

        self.trainer = Trainer.from_params(model,
                                           serialization_dir,
                                           iterator,
                                           train_data,
                                           params=trainer_params,
                                           validation_data=None,
                                           validation_iterator=None)
Example #13
def run_evaluation(evaluation_file, model_archive, random_candidates=False):

    archive = load_archive(model_archive)
    model = archive.model
    vocab = model.vocab
    params = archive.config

    model.multitask = False
    model.multitask_kg = False
    model.cuda()
    model.eval()
    for p in model.parameters():
        p.requires_grad_(False)

    reader_params = params.pop('dataset_reader')
    if reader_params['type'] == 'multitask_reader':
        reader_params = reader_params['dataset_readers']['language_modeling']

    if random_candidates:
        for k, v in reader_params['base_reader'][
                'tokenizer_and_candidate_generator'][
                    'entity_candidate_generators'].items():
            v['random_candidates'] = True

    reader = DatasetReader.from_params(Params(reader_params))

    iterator = DataIterator.from_params(
        Params({
            "type": "self_attn_bucket",
            "batch_size_schedule": "base-11gb-fp32",
            "iterator": {
                "type": "bucket",
                "batch_size": 32,
                "sorting_keys": [["tokens", "num_tokens"]],
                "max_instances_in_memory": 2500,
            }
        }))
    iterator.index_with(vocab)
    instances = reader.read(evaluation_file)

    for batch_no, batch in enumerate(
            tqdm.tqdm(iterator(instances, num_epochs=1))):
        b = move_to_device(batch, 0)
        loss = model(**b)
        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
Example #14
    def __init__(self,
                 model_archive,
                 batch_size=32,
                 masking_strategy=None,
                 wordnet_entity_file=None,
                 vocab_dir=None):

        # get bert_tokenizer_and_candidate_generator
        if os.path.isdir(model_archive):
            config = Params.from_file(
                os.path.join(model_archive, 'config.json'))
        else:
            config = _extract_config_from_archive(cached_path(model_archive))

        # look for the bert_tokenizers and candidate_generator
        candidate_generator_params = _find_key(
            config['dataset_reader'].as_dict(),
            'tokenizer_and_candidate_generator')

        if wordnet_entity_file is not None:
            candidate_generator_params['entity_candidate_generators'][
                'wordnet']['entity_file'] = wordnet_entity_file

        self.tokenizer_and_candidate_generator = TokenizerAndCandidateGenerator.\
                from_params(Params(candidate_generator_params))
        self.tokenizer_and_candidate_generator.whitespace_tokenize = False

        assert masking_strategy is None or masking_strategy == 'full_mask'
        self.masking_strategy = masking_strategy

        # need bert_tokenizer_and_candidate_generator
        if vocab_dir is not None:
            vocab_params = Params({"directory_path": vocab_dir})
        else:
            vocab_params = config['vocabulary']
        self.vocab = Vocabulary.from_params(vocab_params)

        self.iterator = DataIterator.from_params(
            Params({
                "type": "basic",
                "batch_size": batch_size
            }))
        self.iterator.index_with(self.vocab)
Example #15
    def test_wic_reader(self):
        reader_params = Params({
            "type": "wic",
            "tokenizer_and_candidate_generator": {
                "type": "bert_tokenizer_and_candidate_generator",
                "entity_candidate_generators": {
                    "wordnet": {
                        "type":
                        "wordnet_mention_generator",
                        "entity_file":
                        "tests/fixtures/wordnet/entities_fixture.jsonl"
                    }
                },
                "entity_indexers": {
                    "wordnet": {
                        "type": "characters_tokenizer",
                        "tokenizer": {
                            "type": "word",
                            "word_splitter": {
                                "type": "just_spaces"
                            },
                        },
                        "namespace": "entity"
                    }
                },
                "bert_model_type": "tests/fixtures/bert/vocab.txt",
                "do_lower_case": True,
            },
        })

        reader = DatasetReader.from_params(reader_params)
        instances = reader.read(FIXTURES + '/train')
        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        self.assertTrue(len(batch['label_ids']) == 5)

        self.assertEqual(batch['index_a'][0].item(), 3)
        self.assertEqual(batch['index_b'][0].item(), 10)
Example #16
File: testing.py Project: zxlzr/kb
def get_wsd_reader(is_training,
                   use_bert_indexer=False,
                   wordnet_entity_file=None):
    if wordnet_entity_file is None:
        wordnet_entity_file = "tests/fixtures/wordnet/entities_cat_hat.jsonl"

    if use_bert_indexer:
        bert_fixtures = get_bert_test_fixture()
        indexer_params = bert_fixtures["indexer_params"]
    else:
        indexer_params = {"type": "single_id", "lowercase_tokens": True}

    reader_params = {
        "type": "wordnet_fine_grained",
        "wordnet_entity_file": wordnet_entity_file,
        "token_indexers": {
            "tokens": indexer_params,
        },
        "entity_indexer": {
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {
                    "type": "just_spaces"
                },
            },
            "namespace": "entity"
        },
        "is_training": is_training,
        "use_surface_form": False
    }
    reader = DatasetReader.from_params(Params(reader_params))

    vocab_params = {
        "directory_path": "tests/fixtures/wordnet/cat_hat_vocabdir"
    }
    vocab = Vocabulary.from_params(Params(vocab_params))

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    return reader, vocab, iterator
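A short sketch of how the three objects returned by get_wsd_reader might be used together; the fixture path passed to reader.read is an assumption.

reader, vocab, iterator = get_wsd_reader(is_training=False)
instances = reader.read("tests/fixtures/wordnet/wsd_fixture.json")  # hypothetical fixture
for batch in iterator(instances, num_epochs=1, shuffle=False):
    break  # batch is a dict of tensors, indexed with the cat/hat fixture vocabulary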
Example #17
    def test_sample(self):
        generator_params = Params.from_file(
            "kglm/tests/fixtures/training_config/kglm.no-shortlist.json")
        params = Params.from_file(self.param_file)
        dataset_file = "kglm/tests/fixtures/enhanced-wikitext-test/train.jsonl"

        # Need instances from 'generative' reader!
        reader_params = generator_params['dataset_reader']
        reader_params['mode'] = 'generative'
        reader = DatasetReader.from_params(reader_params)
        instances = list(reader.read(dataset_file))

        iterator = DataIterator.from_params(generator_params['iterator'])
        iterator.index_with(self.model.vocab)
        batch, _ = next(iterator(instances, shuffle=False))

        # Samples should match (we'll test by comparing logp)
        torch.manual_seed(123)
        logp1 = self.model.sample(**batch).get('logp', None)
        torch.manual_seed(123)
        logp2 = self.model.sample(**batch).get('logp', None)
Example #18
def get_wic_batch():
    fixtures = 'tests/fixtures/evaluation/wic'

    reader_params = Params({
        "type": "wic",
        "tokenizer_and_candidate_generator": {
            "type": "bert_tokenizer_and_candidate_generator",
            "entity_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "tests/fixtures/wordnet/entities_fixture.jsonl"
                }
            },
            "entity_indexers": {
                "wordnet": {
                    "type": "characters_tokenizer",
                    "tokenizer": {
                        "type": "word",
                        "word_splitter": {
                            "type": "just_spaces"
                        },
                    },
                    "namespace": "entity"
                }
            },
            "bert_model_type": "tests/fixtures/bert/vocab.txt",
            "do_lower_case": True,
        },
    })

    reader = DatasetReader.from_params(reader_params)
    instances = reader.read(fixtures + '/train')
    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    return batch
Example #19
def knowbert_fill2(sentences, model, batcher, vocab, mask_start=0, mask_end=0, config_file=None, top=10):
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 32}))
    config = Params.from_file(config_file)
    vocab_params = config['vocabulary']
    iterator.index_with(Vocabulary.from_params(vocab_params))
    instances = []
    for sent in sentences:
        token_candidates = batcher.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sent.replace('[MASK]', ' [MASK] '))
        masked_tokens = token_candidates['tokens'].copy()
        for i in range(mask_start, mask_end):
            masked_tokens[i] = '[MASK]'
        token_candidates['tokens'] = masked_tokens

        # mask out the entity candidates
        candidates = token_candidates['candidates']
        for candidate_key in candidates.keys():
            indices_to_mask = []
            for k, candidate_span in enumerate(candidates[candidate_key]['candidate_spans']):
                if (candidate_span[0] >= mask_start and candidate_span[0] <= mask_end-1) or (candidate_span[1] >= mask_start and candidate_span[1] <= mask_end-1):
                    indices_to_mask.append(k)
            for ind in indices_to_mask:
                candidates[candidate_key]['candidate_entities'][ind] = ['@@MASK@@']
                candidates[candidate_key]['candidate_entity_priors'][ind] = [1.0]
            if len(indices_to_mask) == 0:
                candidates[candidate_key]['candidate_spans'].append([mask_start, mask_end-1])
                candidates[candidate_key]['candidate_entities'].append(['@@MASK@@'])
                candidates[candidate_key]['candidate_entity_priors'].append([1.0])
                candidates[candidate_key]['candidate_segment_ids'].append(0)
        fields = batcher.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        instances.append(Instance(fields))
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        print(batch['tokens']['tokens'])
        model_output = model(**batch)
        print([vocab[w] for w in batch['tokens']['tokens'][0].numpy()])
        logits, _ = model.pretraining_heads(model_output['contextual_embeddings'], model_output['pooled_output'])
        log_probs = F.log_softmax(logits, dim=-1).cpu()
        for mask_ind in range(mask_start, mask_end):
            topk = torch.topk(log_probs[0, mask_ind], top, -1)[1]
            print([vocab[t.item()] for t in topk])
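A hedged call sketch for knowbert_fill2. It assumes model, batcher (exposing a tokenizer_and_candidate_generator, as in Example #14), and vocab (an id-to-token lookup used for printing) were built elsewhere; the mask positions and config path are placeholders.

sentences = ["The capital of France is [MASK] ."]
knowbert_fill2(sentences, model, batcher, vocab,
               mask_start=6, mask_end=7,             # hypothetical wordpiece positions of [MASK]
               config_file="knowbert_config.json",   # hypothetical config with a 'vocabulary' section
               top=10)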
Example #20
    def ensure_model_can_train_save_and_load(
            self,
            param_file: str,
            tolerance: float = 1e-4,
            cuda_device: int = -1,
            gradients_to_ignore: Set[str] = None,
            overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file,
                                      save_dir,
                                      overrides=overrides)
        loaded_model = load_archive(archive_file,
                                    cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)

        # Need to duplicate params because DatasetReader.from_params will consume.
        reader_params = params['dataset_reader']
        reader_params2 = Params(copy.deepcopy(reader_params.as_dict()))

        reader = DatasetReader.from_params(reader_params)
        reader2 = DatasetReader.from_params(reader_params2)

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        seed_params = Params({
            "random_seed": 5,
            "numpy_seed": 5,
            "pytorch_seed": 5
        })
        prepare_environment(seed_params)
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        seed_params = Params({
            "random_seed": 5,
            "numpy_seed": 5,
            "pytorch_seed": 5
        })
        prepare_environment(seed_params)
        loaded_dataset = reader2.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch,
                                                      gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key,
                                     1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
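In AllenNLP-style model tests, a helper like this is typically invoked from a test method on the same test case, roughly as sketched below.

    def test_model_can_train_save_and_load(self):
        # self.param_file is usually set by self.set_up_model(...) in setUp
        self.ensure_model_can_train_save_and_load(self.param_file)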
Example #21
def run_evaluation(evaluation_file,
                   model_archive_file,
                   is_wordnet_and_wiki=False):
    archive = load_archive(model_archive_file)

    params = archive.config
    vocab = Vocabulary.from_params(params.pop('vocabulary'))

    model = archive.model
    #model.cuda()
    model.eval()

    if is_wordnet_and_wiki:
        reader_params = Params({
            "type": "aida_wiki_linking",
            "entity_disambiguation_only": False,
            "entity_indexer": {
                "type": "characters_tokenizer",
                "namespace": "entity_wiki",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {
                        "type": "just_spaces"
                    }
                }
            },
            "extra_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "s3://allennlp/knowbert/wordnet/entities.jsonl"
                }
            },
            "should_remap_span_indices": True,
            "token_indexers": {
                "tokens": {
                    "type": "bert-pretrained",
                    "do_lowercase": True,
                    "max_pieces": 512,
                    "pretrained_model": "bert-base-uncased",
                    "use_starting_offsets": True,
                }
            }
        })
    else:
        reader_params = Params({
            "type": "aida_wiki_linking",
            "entity_disambiguation_only": False,
            "token_indexers": {
                "tokens": {
                    "type": "bert-pretrained",
                    "pretrained_model": "bert-base-uncased",
                    "do_lowercase": True,
                    "use_starting_offsets": True,
                    "max_pieces": 512,
                },
            },
            "entity_indexer": {
                "type": "characters_tokenizer",
                "tokenizer": {
                    "type": "word",
                    "word_splitter": {
                        "type": "just_spaces"
                    },
                },
                "namespace": "entity",
            },
            "should_remap_span_indices": True,
        })

    if is_wordnet_and_wiki:
        cg_params = Params({
            "type": "bert_tokenizer_and_candidate_generator",
            "bert_model_type": "bert-base-uncased",
            "do_lower_case": True,
            "entity_candidate_generators": {
                "wordnet": {
                    "type": "wordnet_mention_generator",
                    "entity_file":
                    "s3://allennlp/knowbert/wordnet/entities.jsonl"
                }
            },
            "entity_indexers": {
                "wordnet": {
                    "type": "characters_tokenizer",
                    "namespace": "entity_wordnet",
                    "tokenizer": {
                        "type": "word",
                        "word_splitter": {
                            "type": "just_spaces"
                        }
                    }
                }
            }
        })
        candidate_generator = TokenizerAndCandidateGenerator.from_params(
            cg_params)

    reader = DatasetReader.from_params(Params(reader_params))

    iterator = DataIterator.from_params(
        Params({
            "type": "basic",
            "batch_size": 16
        }))
    iterator.index_with(vocab)

    instances = reader.read(evaluation_file)

    for batch_no, batch in enumerate(
            iterator(instances, shuffle=False, num_epochs=1)):
        b = move_to_device(batch, -1)

        b['candidates'] = {
            'wiki': {
                'candidate_entities': b.pop('candidate_entities'),
                'candidate_entity_priors': b.pop('candidate_entity_prior'),
                'candidate_segment_ids': b.pop('candidate_segment_ids'),
                'candidate_spans': b.pop('candidate_spans')
            }
        }
        gold_entities = b.pop('gold_entities')
        b['gold_entities'] = {'wiki': gold_entities}

        if is_wordnet_and_wiki:
            extra_candidates = b.pop('extra_candidates')
            seq_len = b['tokens']['tokens'].shape[1]
            bbb = []
            for e in extra_candidates:
                for k in e.keys():
                    e[k]['candidate_segment_ids'] = [0] * len(
                        e[k]['candidate_spans'])
                ee = {
                    'tokens': ['[CLS]'] * seq_len,
                    'segment_ids': [0] * seq_len,
                    'candidates': e
                }
                ee_fields = candidate_generator.convert_tokens_candidates_to_fields(
                    ee)
                bbb.append(Instance(ee_fields))
            eb = Batch(bbb)
            eb.index_instances(vocab)
            padding_lengths = eb.get_padding_lengths()
            tensor_dict = eb.as_tensor_dict(padding_lengths)
            b['candidates'].update(tensor_dict['candidates'])
            bb = move_to_device(b, -1)
        else:
            bb = b

        loss = model(**bb)
        if batch_no % 100 == 0:
            print(model.get_metrics())

    print(model.get_metrics())
Example #22
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Example #23
def predict(archive_folder, span_file, cluster_file, output_file, cuda_device):
    combine_span_and_cluster_file(span_file, cluster_file)

    test_file = 'tmp_relation_42424242.jsonl'
    relation_threshold = json.load(
        open(archive_folder +
             '/metrics.json'))['best_validation__n_ary_rel_global_threshold']
    print(relation_threshold)

    import_submodules("scirex")
    logging.info("Loading Model from %s", archive_folder)
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()

    model.prediction_mode = True
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    dataset_reader.prediction_mode = True
    instances = dataset_reader.read(test_file)

    for instance in instances:
        batch = Batch([instance])
        batch.index_instances(model.vocab)

    data_iterator = DataIterator.from_params(config["validation_iterator"])
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)
                output_res = model.decode_relations(batch)

            n_ary_relations = output_res['n_ary_relation']
            predicted_relations, scores = n_ary_relations[
                'candidates'], n_ary_relations['scores']
            try:
                metadata = output_res['n_ary_relation']['metadata'][0]
            except (KeyError, IndexError):
                continue
            doc_id = metadata['doc_id']
            coref_key_map = {
                k: i
                for i, k in metadata['document_metadata']
                ['cluster_name_to_id'].items()
            }

            for i, rel in enumerate(predicted_relations):
                predicted_relations[i] = tuple([
                    coref_key_map[k] if k in coref_key_map else None
                    for k in rel
                ])

            if doc_id not in documents:
                documents[doc_id] = {
                    'predicted_relations': [],
                    'doc_id': doc_id
                }
            scores_ = list(scores.ravel())
            if not scores_:
                warnings.warn(f"no relation scores defined for {doc_id}")
                continue
            label = [1 if x > relation_threshold else 0 for x in scores_]
            if all(l == 0 for l in label):
                decoding_mode = os.environ.get("SCIREX_RELATION_DECODING")
                if decoding_mode == "report_single_most_likely":
                    label[scores.argmax()] = 1
                elif decoding_mode == "report_probabilistically":
                    idxs_sorted_by_score = sorted(
                        range(len(label)),
                        key=lambda i: scores[i],
                        reverse=True  # highest score first
                    )
                    possible_decoding_idxs = \
                        [idxs_sorted_by_score[:i] for i in range(1, 11)]  # assuming that >10 relationships would never happen

                    def score_decoding(candidate_idxs):
                        """likelihood function for a geometric distribution fit
                        to the training distribution of number-of-relationships-per-document

                        :param candidate_idxs (List[int]): a list of idxs that represents a relationship distribution
                        :return: likelihood that distribution
                        """
                        score_from_n_relationships = st.geom.pmf(
                            len(candidate_idxs),
                            0.4046692607003891  # MLE from training distribution, i.e.: 1 / (1 + E[X])
                        )
                        score_from_indiv_relationships = scores[candidate_idxs]
                        return score_from_n_relationships * np.prod(
                            score_from_indiv_relationships)

                    best_decoding_idxs = max(possible_decoding_idxs,
                                             key=score_decoding)
                    for idx in best_decoding_idxs:
                        label[idx] = 1

            scores = [round(float(x), 4) for x in list(scores.ravel())]
            documents[doc_id]['predicted_relations'] += list(
                zip(predicted_relations, scores, label))

        for d in documents.values():
            predicted_relations = {}
            for r, s, l in d['predicted_relations']:
                r = tuple(r)
                if r not in predicted_relations or predicted_relations[r][
                        0] < s:
                    predicted_relations[r] = (s, l)

            d['predicted_relations'] = [
                (r, s, l) for r, (s, l) in predicted_relations.items()
            ]

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
Example #24
def train_model(params: Union[Params, Dict[str, Any]], cuda_device: int,
                serialization_dir: str, filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any PyTorch or
    numpy code that affects random state before you import and use this
    function; these libraries rely on random seeds, which can be set here via
    the JSON specification file. Note also that this function performs
    training and will evaluate the trained model on the development and test
    sets if they are provided in the parameter JSON.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    try:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr, True)  # type: ignore
    except TypeError:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    read_settings = ds_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(ds_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(
        train_data_path,
        include_metadata=True,
        replace_with_gold=read_settings.pop('replace_gold', False),
        pad_with_nearest=read_settings.pop('pad_with_nearest', 0))

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path,
                                              include_metadata=True)
    else:
        validation_data = None

    vocab_params = params.pop("vocabulary", {})
    # A pre-built vocabulary is required here: it is loaded from `directory_path`
    # rather than constructed from the training data.
    assert 'directory_path' in vocab_params, "`vocabulary.directory_path` must be set"
    vocab = Vocabulary.from_params(vocab_params)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
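
A hypothetical usage sketch for the function above; the config path, serialization directory, and device are placeholders, and note that this variant accepts ``filtering`` but does not use it in the body:

from allennlp.common import Params

params = Params.from_file("experiments/fever_claim_verification.json")  # placeholder path
model = train_model(params,
                    cuda_device=0,          # -1 to train on CPU
                    serialization_dir="logs/fever_run",
                    filtering="")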
Ejemplo n.º 25
0
def forward_on_instances(
        model, instances: Iterable[Instance],
        data_iterator: DataIterator) -> List[Dict[str, numpy.ndarray]]:
    """
    Basically a copy of Model.forward_on_instances, but also takes a DataIterator in order to be more efficient.


    Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any
    ``torch.Tensors`` into numpy arrays and separate the
    batched output into a list of individual dicts per instance. Note that typically
    this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
    :func:`forward_on_instance`.

    Parameters
    ----------
    model : AllenNLP model, required
        The model to run.
    instances : List[Instance], required
        The instances to run the model on.
    data_iterator: DataIterator, required
        The DataIterator used for going over the data (e.g. BucketIterator)

    Returns
    -------
    A list of the model's outputs, one per instance.
    """
    data_iterator.index_with(model.vocab)
    with torch.no_grad():
        return_val: List[Dict[str, numpy.ndarray]] = []
        cuda_device = model._get_prediction_device()
        for dataset in data_iterator._create_batches(instances, shuffle=False):
            batch_size = len(dataset.instances)
            dataset.index_instances(model.vocab)
            model_input = util.move_to_device(dataset.as_tensor_dict(),
                                              cuda_device)
            outputs = model.decode(model(**model_input))
            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        model._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    model._maybe_warn_for_unseparable_batches(name)
                    continue
                for instance_output, batch_element in zip(
                        instance_separated_output, output):
                    instance_output[name] = batch_element
            return_val.extend(instance_separated_output)
        return return_val
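
A short usage sketch, assuming an already-constructed ``model`` and dataset ``reader`` (both placeholders here); any ``DataIterator`` such as a ``BasicIterator`` works:

from allennlp.data.iterators import BasicIterator

iterator = BasicIterator(batch_size=32)
instances = reader.read("data/dev.jsonl")          # placeholder path
outputs = forward_on_instances(model, instances, iterator)
print(len(outputs), sorted(outputs[0].keys()))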
Ejemplo n.º 26
0
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
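
A hedged sketch of how a test might call this helper, assuming it lives on an AllenNLP-style ``ModelTestCase``; the fixture paths are placeholders:

from allennlp.common.testing import ModelTestCase

class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model("tests/fixtures/my_model/experiment.json",
                          "tests/fixtures/my_model/instances.jsonl")

    def test_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)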
Ejemplo n.º 27
0
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search for the given ``num_batches`` and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        Increase the learning rate linearly if ``True``, exponentially otherwise.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
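
A hypothetical usage sketch; the config path, output directory, and search bounds are placeholders:

from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.json")  # placeholder path
find_learning_rate_model(params,
                         serialization_dir="lr_search",
                         start_lr=1e-6,
                         end_lr=1.0,
                         num_batches=200,
                         force=True)
# Inspect lr_search/lr-losses.png and pick a learning rate just before the loss starts to diverge.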
Ejemplo n.º 28
0
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs a learning rate search for the given ``num_batches`` and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr : ``float``
        Learning rate at which to start the search.
    end_lr : ``float``
        Learning rate up to which the search is done.
    num_batches : ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps : ``bool``
        Increase the learning rate linearly if ``True``, exponentially otherwise.
    stopping_factor : ``float``
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force : ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    create_serialization_dir(params,
                             serialization_dir,
                             recover=False,
                             force=force)

    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)
    distributed_params = params.params.get("distributed", None)
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for key, dataset in all_datasets.items()
                   for instance in dataset
                   if key in datasets_for_vocab_creation),
    )

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets["train"]

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError(
            "currently find-learning-rate only works with the default Trainer")
    trainer: Trainer = TrainerBase.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=None,
        params=trainer_params,
        validation_iterator=None,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, "lr-losses.png"))
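
The final ``_smooth(losses, 0.98)`` call is not shown in this snippet; as an assumption about what such a helper typically does, here is a minimal sketch of bias-corrected exponential smoothing of the loss curve (the helper name and values below are made up, not taken from the library):

def _smooth_sketch(values, beta):
    # Bias-corrected exponential moving average, commonly used to smooth LR-finder loss curves.
    avg, smoothed = 0.0, []
    for i, value in enumerate(values):
        avg = beta * avg + (1 - beta) * value
        smoothed.append(avg / (1 - beta ** (i + 1)))
    return smoothed

print(_smooth_sketch([2.0, 1.8, 1.5, 1.6, 3.2], beta=0.98))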
Ejemplo n.º 29
0
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search for the given ``num_batches`` and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        Increase the learning rate linearly if ``True``, exponentially otherwise.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)


    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
Ejemplo n.º 30
0
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Ejemplo n.º 31
0
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any code that uses
    Pytorch or numpy before you import and use this function, since these
    libraries rely on random seeds which can be set in this function via the
    JSON specification file. Note that this function performs training and will
    also evaluate the trained model on the development and test sets if they
    are provided in the parameter JSON.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
Ejemplo n.º 32
0
def predict(archive_folder, span_prediction_file, output_file, cuda_device):
    '''
    span_prediction_file (jsonl) needs at least three fields:
        - doc_id, words: List[str], field: List[Tuple[start_index, end_index, type]]

    Returns output_file (jsonl) -
        {
            'doc_id' : str,
            'pairwise_coreference_scores' : List[((s_1, e_1), (s_2, e_2), float in [0, 1] rounded to 4 decimal places)]
        }
    '''
    import_submodules("scirex")
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader_params.pop('type')
    dataset_reader = ScirexCoreferenceEvalReader.from_params(
        params=dataset_reader_params, field="ner")
    instances = dataset_reader.read(span_prediction_file)

    # Index the instances against the model's vocabulary; the Batch is only used for this side effect.
    batch = Batch(instances)
    batch.index_instances(model.vocab)

    config['iterator'].pop('batch_size')
    data_iterator = DataIterator.from_params(config["iterator"],
                                             batch_size=1000)
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch,
                                               cuda_device)  # Put on GPU.
                pred = model(**batch)
                decoded = model.decode(pred)

            metadata = decoded["metadata"]
            label_prob: List[float] = [
                float(x) for x in decoded["label_probs"]
            ]
            doc_ids: List[str] = [m["doc_id"] for m in metadata]
            span_premise = [m["span_premise"] for m in metadata]
            span_hypothesis = [m["span_hypothesis"] for m in metadata]
            fields = [m["field"] for m in metadata]
            assert len(set(fields)) == 1, f"expected a single field per batch, got {set(fields)}"

            for doc_id, span_p, span_h, p in zip(doc_ids, span_premise,
                                                 span_hypothesis, label_prob):
                if doc_id not in documents:
                    documents[doc_id] = {
                        "doc_id": doc_id,
                        "pairwise_coreference_scores": []
                    }

                documents[doc_id]["pairwise_coreference_scores"].append(
                    ((span_p[0], span_p[1]), (span_h[0], span_h[1]),
                     round(p, 4)))

        f.write("\n".join([json.dumps(x) for x in documents.values()]))
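
A hypothetical illustration of the input and output line formats described in the docstring above; the document id, spans, and score are made-up values, and the span field is assumed to be named ``ner`` to match ``field="ner"``:

import json

input_line = {
    "doc_id": "paper_001",
    "words": ["We", "train", "BERT", "on", "SQuAD", "."],
    "ner": [[2, 3, "Method"], [4, 5, "Material"]],  # (start_index, end_index, type) spans
}
output_line = {
    "doc_id": "paper_001",
    "pairwise_coreference_scores": [[[2, 3], [4, 5], 0.0312]],
}
print(json.dumps(input_line))
print(json.dumps(output_line))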
Ejemplo n.º 33
0
    def from_partial_objects(
        cls,
        serialization_dir: str,
        local_rank: int,
        batch_weight_key: str,
        dataset_reader: DatasetReader,
        train_data_path: str,
        model: Lazy[Model],
        iterator: DataIterator,
        trainer: Lazy[TrainerBase],
        vocabulary: Lazy[Vocabulary] = None,
        datasets_for_vocab_creation: List[str] = None,
        validation_dataset_reader: DatasetReader = None,
        validation_data_path: str = None,
        validation_iterator: DataIterator = None,
        test_data_path: str = None,
        evaluate_on_test: bool = False,
    ) -> "TrainModel":
        """
        This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
        object from a config file passed to the `allennlp train` command.  The arguments to this
        method are the allowed top-level keys in a configuration file (except for the first three,
        which are obtained separately).

        You *could* use this outside of our `FromParams` logic if you really want to, but there
        might be easier ways to accomplish your goal than instantiating `Lazy` objects.  If you are
        writing your own training loop, we recommend that you look at the implementation of this
        method for inspiration and possibly some utility functions you can call, but you very likely
        should not use this method directly.

        The `Lazy` type annotations here are a mechanism for building dependencies to an object
        sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
        needs to see the data before it's constructed (to create a vocabulary) and the trainer needs
        the data and the model before it's constructed.  Objects that have sequential dependencies
        like this are labeled as `Lazy` in their type annotations, and we pass the missing
        dependencies when we call their `construct()` method, which you can see in the code below.

        # Parameters
        serialization_dir: `str`
            The directory where logs and model archives will be saved.
        local_rank: `int`
            The process index that is initialized using the GPU device id.
        batch_weight_key: `str`
            The name of metric used to weight the loss on a per-batch basis.
        dataset_reader: `DatasetReader`
            The `DatasetReader` that will be used for training and (by default) for validation.
        train_data_path: `str`
            The file (or directory) that will be passed to `dataset_reader.read()` to construct the
            training data.
        model: `Lazy[Model]`
            The model that we will train.  This is lazy because it depends on the `Vocabulary`;
            after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.
        iterator: `DataIterator`
            The iterator we use to batch instances from the dataset reader at training and (by
            default) validation time.
        trainer: `Lazy[TrainerBase]`
            The `Trainer` that actually implements the training loop.  This is a lazy object because
            it depends on the model that's going to be trained.
        vocabulary: `Lazy[Vocabulary]`, optional (default=None)
            The `Vocabulary` that we will use to convert strings in the data to integer ids (and
            possibly set sizes of embedding matrices in the `Model`).  By default we construct the
            vocabulary from the instances that we read.
        datasets_for_vocab_creation: `List[str]`, optional (default=None)
            If you pass in more than one dataset but don't want to use all of them to construct a
            vocabulary, you can pass in this key to limit it.  Valid entries in the list are
            "train", "validation" and "test".
        validation_dataset_reader: `DatasetReader`, optional (default=None)
            If given, we will use this dataset reader for the validation data instead of
            `dataset_reader`.
        validation_data_path: `str`, optional (default=None)
            If given, we will use this data for computing validation metrics and early stopping.
        validation_iterator: `DataIterator`, optional (default=None)
            If given, we will use this iterator for batching and scheduling instances for the
            validation data, instead of `iterator`.
        test_data_path: `str`, optional (default=None)
            If given, we will use this as test data.  This makes it available for vocab creation by
            default, but nothing else.
        evaluate_on_test: `bool`, optional (default=False)
            If given, we will evaluate the final model on this data at the end of training.  Note
            that we do not recommend using this for actual test data in every-day experimentation;
            you should only very rarely evaluate your model on actual test data.
        """

        datasets = training_util.read_all_datasets(
            train_data_path=train_data_path,
            dataset_reader=dataset_reader,
            validation_dataset_reader=validation_dataset_reader,
            validation_data_path=validation_data_path,
            test_data_path=test_data_path,
        )

        if datasets_for_vocab_creation:
            for key in datasets_for_vocab_creation:
                if key not in datasets:
                    raise ConfigurationError(
                        f"invalid 'dataset_for_vocab_creation' {key}")

        instance_generator = (instance for key, dataset in datasets.items()
                              if not datasets_for_vocab_creation
                              or key in datasets_for_vocab_creation
                              for instance in dataset)

        vocabulary_ = vocabulary.construct(instances=instance_generator)
        if not vocabulary_:
            vocabulary_ = Vocabulary.from_instances(instance_generator)
        model_ = model.construct(vocab=vocabulary_)

        # Initializing the model can have side effect of expanding the vocabulary.
        # Save the vocab only in the master. In the degenerate non-distributed
        # case, we're trivially the master.
        if common_util.is_master():
            vocabulary_path = os.path.join(serialization_dir, "vocabulary")
            vocabulary_.save_to_files(vocabulary_path)

        iterator.index_with(model_.vocab)
        validation_iterator = validation_iterator or iterator
        validation_iterator.index_with(
            model_.vocab)  # it is ok to call this twice

        # We don't need to pass serialization_dir and local_rank here, because they will have been
        # passed through the trainer by from_params already, because they were keyword arguments to
        # construct this class in the first place.
        trainer_ = trainer.construct(
            model=model_,
            iterator=iterator,
            train_data=datasets["train"],
            validation_iterator=validation_iterator,
            validation_data=datasets.get("validation"),
        )

        return cls(
            serialization_dir=serialization_dir,
            model=model_,
            trainer=trainer_,
            evaluation_dataset=datasets.get("test"),
            evaluation_iterator=validation_iterator,
            evaluate_on_test=evaluate_on_test,
            batch_weight_key=batch_weight_key,
        )
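
A hedged sketch of a training config skeleton whose top-level keys correspond to the parameters of ``from_partial_objects`` above; the registered names and paths are placeholders, not values from any particular experiment:

config = {
    "dataset_reader": {"type": "my_reader"},            # placeholder registered name
    "train_data_path": "data/train.jsonl",
    "validation_data_path": "data/dev.jsonl",
    "datasets_for_vocab_creation": ["train"],
    "model": {"type": "my_model"},
    "iterator": {"type": "bucket", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
    "evaluate_on_test": False,
}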
Ejemplo n.º 34
0
def predict(archive_folder, span_file, cluster_file, output_file, cuda_device):
    combine_span_and_cluster_file(span_file, cluster_file)

    test_file = 'tmp_relation_42424242.jsonl'
    with open(os.path.join(archive_folder, 'metrics.json')) as f:
        relation_threshold = json.load(f)['test__n_ary_rel_global_threshold']
    print(relation_threshold)

    import_submodules("scirex")
    logging.info("Loading Model from %s", archive_folder)
    archive_file = os.path.join(archive_folder, "model.tar.gz")
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()

    model.prediction_mode = True
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    dataset_reader.prediction_mode = True
    instances = dataset_reader.read(test_file)

    for instance in instances:
        batch = Batch([instance])
        batch.index_instances(model.vocab)

    data_iterator = DataIterator.from_params(config["validation_iterator"])
    iterator = data_iterator(instances, num_epochs=1, shuffle=False)

    with open(output_file, "w") as f:
        documents = {}
        for batch in tqdm(iterator):
            with torch.no_grad():
                batch = nn_util.move_to_device(batch, cuda_device)
                output_res = model.decode_relations(batch)

            n_ary_relations = output_res['n_ary_relation']
            predicted_relations, scores = n_ary_relations[
                'candidates'], n_ary_relations['scores']

            if 'metadata' not in output_res['n_ary_relation']:
                continue

            metadata = output_res['n_ary_relation']['metadata'][0]
            doc_id = metadata['doc_id']
            # Invert cluster_name_to_id: map each predicted cluster id back to its cluster name.
            coref_key_map = {
                k: i
                for i, k in metadata['document_metadata']
                ['cluster_name_to_id'].items()
            }

            for i, rel in enumerate(predicted_relations):
                predicted_relations[i] = tuple([
                    coref_key_map[k] if k in coref_key_map else None
                    for k in rel
                ])

            if doc_id not in documents:
                documents[doc_id] = {
                    'predicted_relations': [],
                    'doc_id': doc_id
                }

            label = [
                1 if x > relation_threshold else 0
                for x in list(scores.ravel())
            ]
            scores = [round(float(x), 4) for x in list(scores.ravel())]
            documents[doc_id]['predicted_relations'] += list(
                zip(predicted_relations, scores, label))

        for d in documents.values():
            predicted_relations = {}
            for r, s, l in d['predicted_relations']:
                r = tuple(r)
                if r not in predicted_relations or predicted_relations[r][
                        0] < s:
                    predicted_relations[r] = (s, l)

            d['predicted_relations'] = [
                (r, s, l) for r, (s, l) in predicted_relations.items()
            ]

        f.write("\n".join([json.dumps(x) for x in documents.values()]))