Example #1
    def from_archive(cls,
                     generative_archive: Archive,
                     discriminative_archive: Archive,
                     predictor_name: str = None) -> 'Predictor':
        """
        Instantiate a :class:`CompleteTheSentencePredictor` from a :class:`~allennlp.models.archival.Archive`;
        that is, from the result of training a model. Optionally specify which `Predictor`
        subclass; otherwise, the default one for the model will be used.
        """
        # We need to duplicate the configs so that they do not get consumed inside the archive
        generative_config = generative_archive.config.duplicate()
        discriminative_config = discriminative_archive.config.duplicate()

        model = generative_archive.model
        sampler = discriminative_archive.model

        dataset_reader_params_model = generative_config['dataset_reader']
        dataset_reader_params_sampler = discriminative_config['dataset_reader']

        dataset_reader_model = DatasetReader.from_params(
            dataset_reader_params_model)
        dataset_reader_sampler = DatasetReader.from_params(
            dataset_reader_params_sampler)

        model.eval()
        sampler.eval()

        return Predictor.by_name(predictor_name)(model, sampler,
                                                 dataset_reader_model,
                                                 dataset_reader_sampler)
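For context, a minimal sketch of how a classmethod like this might be invoked; the archive paths and the registered predictor name are hypothetical, and CompleteTheSentencePredictor is assumed to be importable from its project:

from allennlp.models.archival import load_archive

# Hypothetical archives; any paths to trained model.tar.gz files would do.
generative_archive = load_archive("generative/model.tar.gz")
discriminative_archive = load_archive("discriminative/model.tar.gz")
predictor = CompleteTheSentencePredictor.from_archive(
    generative_archive,
    discriminative_archive,
    predictor_name="complete-the-sentence")  # assumed registered name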
Example #2
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(TRAIN_PATH)
    validation_data = reader.read(DEV_PATH)
    return training_data, validation_data
Example #3
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("/path/to/your/training/data")
    validation_data = reader.read("/path/to/your/validation/data")
    return training_data, validation_data
Example #4
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("quick_start/data/movie_review/train.tsv")
    validation_data = reader.read("quick_start/data/movie_review/dev.tsv")
    return training_data, validation_data
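These read_data variants only load the instances; a minimal sketch (using the pre-1.0 AllenNLP API seen throughout these examples, with `reader` assumed to be an already constructed DatasetReader) of the usual next step, building a vocabulary from the returned data:

from allennlp.data import Vocabulary

train_data, dev_data = read_data(reader)
vocab = Vocabulary.from_instances(train_data)
print("Vocabulary size:", vocab.get_vocab_size("tokens"))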
Example #5
def read_data(reader: DatasetReader, tgt_domain: str, input_path: str,
              domains: List[str]
              ) -> Tuple[Iterable[Instance], Iterable[Instance], Iterable[Instance]]:
    print("Reading data")

    training_data = None
    for domain in domains:
        if domain != tgt_domain:
            if training_data is None:
                training_data = reader.read(input_path + domain + '/' +
                                            domain + '_neg.txt')
            else:
                training_data += reader.read(input_path + domain + '/' +
                                             domain + '_neg.txt')

    valid_test_data = reader.read(input_path + tgt_domain + '/' + tgt_domain +
                                  '_neg.txt')

    as_per_percent = int(len(valid_test_data) * 0.25)
    valid_size = 2000 if as_per_percent >= 2000 else as_per_percent

    validation_data = valid_test_data[:valid_size]
    test_data = valid_test_data[valid_size:]

    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    test_data = AllennlpDataset(test_data)

    print("train:", len(training_data), "validation:", len(validation_data),
          "test:", len(test_data))
    return training_data, validation_data, test_data
Example #6
def read_data(
    reader: DatasetReader,
    train_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
    valid_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv"
) -> Tuple[Iterable[Instance], Iterable[Instance]]:

    logger.critical("Reading the data. Lazy variable set to {}".format(
        reader.lazy))
    start_time = time.time()
    '''Expect: that this is the only time it is called'''
    reader.mode = "train"
    training_data = reader.read(train_data_path)

    # instead, we will set the examples differently here
    reader.mode = "valid"
    validation_data = reader.read(
        valid_data_path)  #need to unlimit the examples here...

    logger.critical(
        "Finished the call to read the data. Time took {}".format(time.time() -
                                                                  start_time))

    return training_data, validation_data
Example #7
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("data/train.tsv")
    validation_data = reader.read("data/dev.tsv")
    return training_data, validation_data
Example #8
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("./data/sample/train_dataset.json")
    validation_data = reader.read(
        "./data/sample/train_dataset.json")  # TODO: same data
    return training_data, validation_data
Example #9
def main(span_model_path: str, span_to_question_model_path: str,
         cuda_device: int, input_file: str, output_file: str,
         span_min_prob: float, question_min_prob: float,
         question_beam_size: int) -> None:

    check_for_gpu(cuda_device)

    span_model_archive = load_archive_from_folder(
        span_model_path,
        cuda_device=cuda_device,
        overrides=
        '{ "model": { "span_selector": {"span_decoding_threshold": 0.00} } }',
        weights_file=os.path.join(span_model_path, "best.th"))

    # override span detection threshold to be low enough so we can reasonably approximate bad spans
    # as having probability 0.
    span_to_question_model_archive = load_archive_from_folder(
        span_to_question_model_path,
        cuda_device=cuda_device,
        weights_file=os.path.join(span_to_question_model_path, "best.th"))

    span_model_dataset_reader_params = span_model_archive.config[
        "dataset_reader"].duplicate()
    span_model_dataset_reader_params["qasrl_filter"]["allow_all"] = True

    span_to_question_model_dataset_reader_params = span_to_question_model_archive.config[
        "dataset_reader"].duplicate()
    span_to_question_model_dataset_reader_params["qasrl_filter"][
        "allow_all"] = True

    pipeline = AFirstPipelineSequential(
        span_model=span_model_archive.model,
        span_model_dataset_reader=DatasetReader.from_params(
            span_model_dataset_reader_params),
        span_to_question_model=span_to_question_model_archive.model,
        span_to_question_model_dataset_reader=DatasetReader.from_params(
            span_to_question_model_dataset_reader_params),
        span_minimum_threshold=span_min_prob,
        question_minimum_threshold=question_min_prob,
        question_beam_size=question_beam_size)
    if output_file is None:
        for line in tqdm(read_lines(cached_path(input_file))):
            input_json = json.loads(line)
            output_json = pipeline.predict(input_json)
            print(json.dumps(output_json))
    elif output_file.endswith('.gz'):
        with gzip.open(output_file, 'wt') as f:
            for line in tqdm(read_lines(cached_path(input_file))):
                input_json = json.loads(line)
                output_json = pipeline.predict(input_json)
                f.write(json.dumps(output_json))
                f.write('\n')
    else:
        with open(output_file, 'w', encoding='utf8') as out:
            for line in tqdm(read_lines(cached_path(input_file))):
                input_json = json.loads(line)
                output_json = pipeline.predict(input_json)
                print(json.dumps(output_json), file=out)
Example #10
def read_data(
    reader: DatasetReader,
        train_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
        valid_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv"
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(train_data_path)
    validation_data = reader.read(valid_data_path)
    return training_data, validation_data
Example #11
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    
    training_data = reader.read('../data/snips/utterances_train_features.txt')
    validation_data = reader.read('../data/snips/utterances_valid_features.txt')
    
    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    
    print("train:",len(training_data), "validation:", len(validation_data))
    return training_data, validation_data
Example #12
def evaluate_from_args(args: argparse.Namespace):
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
                                         if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    csv_writer = csv.writer(args.output_file)

    keys = None
    for instance in instances:
        metrics = evaluate(model, [instance], iterator, args.cuda_device, args.batch_weight_key)

        if keys is None:
            keys = sorted(metrics.keys())
            csv_writer.writerow(['instance_id', *keys])

        instance_id = instance.fields['metadata']['id']

        values = [metrics[key] for key in keys]
        csv_writer.writerow([instance_id, *values])
Example #13
 def test_from_params(self):
     squad1_reader = DatasetReader.from_params(Params({"type": "squad1"}))
     assert squad1_reader.no_answer_token is None
     squad2_reader = DatasetReader.from_params(Params({"type": "squad2"}))
     assert squad2_reader.no_answer_token is not None
     with pytest.warns(DeprecationWarning):
         squad_reader = DatasetReader.from_params(
             Params({
                 "type": "squad1",
                 "skip_invalid_examples": True
             }))
         assert squad_reader.skip_impossible_questions is True
Example #14
    def __init__(self,
                 question_model_archive: Archive,
                 # question_model_dataset_reader: QasrlReader,
                 question_to_span_model_archive: QuestionToSpanModel,
                 # question_to_span_model_dataset_reader: QasrlReader,
                 tan_model_archive: Optional[Archive] = None,
                 span_to_tan_model_archive: Optional[Archive] = None,
                 animacy_model_archive: Optional[Archive] = None,
                 question_minimum_threshold: float = question_minimum_threshold_default,
                 span_minimum_threshold: float = span_minimum_threshold_default,
                 tan_minimum_threshold: float = tan_minimum_threshold_default,
                 question_beam_size: int = question_beam_size_default,
                 clause_mode: bool = False) -> None:
        self._question_model = question_model_archive.model
        self._question_model_dataset_reader = DatasetReader.from_params(question_model_archive.config["dataset_reader"].duplicate())
        print("Question model loaded.", flush = True)
        self._question_to_span_model = question_to_span_model_archive.model
        self._question_to_span_model_dataset_reader = DatasetReader.from_params(question_to_span_model_archive.config["dataset_reader"].duplicate())
        print("Question-to-span model loaded.", flush = True)
        if tan_model_archive is not None:
            self._tan_model = tan_model_archive.model
            self._tan_model_dataset_reader = DatasetReader.from_params(tan_model_archive.config["dataset_reader"].duplicate())
            print("TAN model loaded.", flush = True)
        else:
            self._tan_model = None
        if span_to_tan_model_archive is not None:
            self._span_to_tan_model = span_to_tan_model_archive.model
            self._span_to_tan_model_dataset_reader = DatasetReader.from_params(span_to_tan_model_archive.config["dataset_reader"].duplicate())
            print("Span-to-TAN model loaded.", flush = True)
        else:
            self._span_to_tan_model = None
        if animacy_model_archive is not None:
            self._animacy_model = animacy_model_archive.model
            self._animacy_model_dataset_reader = DatasetReader.from_params(animacy_model_archive.config["dataset_reader"].duplicate())
            print("Animacy model loaded.", flush = True)
        else:
            self._animacy_model = None
        print("All models loaded.", flush = True)

        self._span_minimum_threshold = span_minimum_threshold
        self._question_minimum_threshold = question_minimum_threshold
        self._tan_minimum_threshold = tan_minimum_threshold
        self._question_beam_size = question_beam_size
        self._clause_mode = clause_mode

        qg_slots = set(self._question_model.get_slot_names())
        qa_slots = set(self._question_to_span_model.get_slot_names())
        if not qa_slots.issubset(qg_slots):
            raise ConfigurationError(
                "Question Answerer must read in a subset of question slots generated by the Question Generator. " + \
                ("QG slots: %s; QA slots: %s" % (qg_slots, qa_slots)))
Example #15
def read_data(
    train_path: str,
    val_path: str,
    train_reader: DatasetReader,
    val_reader: DatasetReader = None
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    print(type(train_reader), train_path)
    training_data = train_reader.read(train_path)
    if val_reader is None:
        validation_data = train_reader.read(val_path)
    else:
        validation_data = val_reader.read(val_path)
    return training_data, validation_data
Example #16
 def setUp(self):
     super(TestCopyNetReader, self).setUp()
     params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
     self.instances = ensure_list(instances)
     self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Example #17
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read('tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab, params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #18
    def test_batch_predictions_are_consistent(self):
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read(self.FIXTURES_ROOT / 'data' /
                                     'squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_embedders'][
            'token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab=vocab, params=params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #19
    def from_archive(cls,
                     archive: Archive,
                     predictor_name: str = None) -> 'Predictor':
        """
        Instantiate a :class:`Predictor` from an :class:`~allennlp.models.archival.Archive`;
        that is, from the result of training a model. Optionally specify which `Predictor`
        subclass; otherwise, the default one for the model will be used.
        """
        # Duplicate the config so that the config inside the archive doesn't get consumed
        config = archive.config.duplicate()

        if not predictor_name:
            model_type = config.get("model").get("type")
            if model_type not in DEFAULT_PREDICTORS:
                raise ConfigurationError(f"No default predictor for model type {model_type}.\n"\
                                         f"Please specify a predictor explicitly.")
            predictor_name = DEFAULT_PREDICTORS[model_type]

        dataset_reader_params = config["dataset_reader"]
        dataset_reader = DatasetReader.from_params(dataset_reader_params)

        model = archive.model
        model.eval()

        return Predictor.by_name(predictor_name)(model, dataset_reader)
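A usage sketch for this classmethod; the archive path and the JSON keys depend on the model and predictor, and are only illustrative:

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

archive = load_archive("model.tar.gz")
predictor = Predictor.from_archive(archive)  # falls back to the model's default predictor
output = predictor.predict_json({"sentence": "This is an illustrative input."})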
Example #20
    def test_create_models_from_allennlp_configs(self, config_path):

        params = Params.from_file(
            str(config_path),
            ext_vars={
                "CLF_TRAIN_DATA_PATH": "",
                "CLF_VALID_DATA_PATH": "",
                "DISCRETIZER_PATH": str(DISCRETIZER_PATH),
                "VOCAB_PATH": str(VOCAB_PATH),
            },
        )

        reader = DatasetReader.from_params(params["dataset_reader"])

        instances = reader.read(DATA_PATH)
        vocab = Vocabulary.from_instances(instances)
        num_labels = vocab.get_vocab_size(namespace="labels")

        batch = Batch(instances)
        batch.index_instances(vocab)

        try:
            model = Model.from_params(params=params["model"], vocab=vocab)
        except Exception as e:
            raise AssertionError(f"unable to load params from {config_path}") from e

        output_dict = model(**batch.as_tensor_dict())

        assert "probs" in output_dict
        assert len(output_dict["probs"].shape) == 2
        assert output_dict["probs"].shape[0] == len(instances)
        assert output_dict["probs"].shape[1] == num_labels
Example #21
 def test_from_params(self, data_path: str, sentence_marker_params: Params, ccm_params: Params) -> None:
     reader = DatasetReader.from_params(sentence_marker_params)
     instances = reader.read(data_path)
     vocab = Vocabulary.from_instances(instances)
     ccm_module = ConstrainedConditionalModule.from_params(vocab=vocab, params=ccm_params)
     index = vocab.get_token_index("I-type", "labels")
     assert ccm_module._sentence_penalty_map == (index, 50.)
Example #22
    def test_batch_predictions_are_consistent(self):
        # The same issue as the bidaf test case.
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder. So, we'll remove the CNN encoder entirely from the model for this test.
        # Save some state.
        # pylint: disable=protected-access,attribute-defined-outside-init
        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read(self.FIXTURES_ROOT / 'data' /
                                     'squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_embedders'][
            'token_characters']
        params['model']['phrase_layer']['num_convs_per_block'] = 0
        params['model']['modeling_layer']['num_convs_per_block'] = 0
        self.model = Model.from_params(vocab=vocab, params=params['model'])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #23
 def setUp(self):
     super().setUp()
     params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
     self.instances = ensure_list(instances)
     self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Example #24
    def from_archive(
        cls,
        archive: Archive,
        predictor_name: str = None,
        dataset_reader_to_load: str = "validation",
        frozen: bool = True,
        language: str = "en_core_web_sm",
        restrict_frames: bool = False,
        restrict_roles: bool = False,
    ) -> "Predictor":
        # Duplicate the config so that the config inside the archive doesn't get consumed
        config = archive.config.duplicate()

        if not predictor_name:
            model_type = config.get("model").get("type")
            model_class, _ = Model.resolve_class_name(model_type)
            predictor_name = model_class.default_predictor
        predictor_class: Type[Predictor] = (
            Predictor.by_name(predictor_name) if predictor_name is not None else cls  # type: ignore
        )

        if dataset_reader_to_load == "validation" and "validation_dataset_reader" in config:
            dataset_reader_params = config["validation_dataset_reader"]
        else:
            dataset_reader_params = config["dataset_reader"]
        dataset_reader = DatasetReader.from_params(dataset_reader_params)

        model = archive.model
        if frozen:
            model.restrict_frames = restrict_frames
            model.restrict_roles = restrict_roles
            model.eval()

        return predictor_class(model, dataset_reader, language)
Example #25
    def test_batch_predictions_are_consistent(self):
        # The same issue as the bidaf test case.
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder. So, we'll remove the CNN encoder entirely from the model for this test.
        # Save some state.

        saved_model = self.model
        saved_instances = self.instances

        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params["dataset_reader"])
        reader._token_indexers = {"tokens": reader._token_indexers["tokens"]}
        self.instances = reader.read(FIXTURES_ROOT / "data" / "squad.json")
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params["model"]["text_field_embedder"]["token_embedders"][
            "token_characters"]
        params["model"]["phrase_layer"]["num_convs_per_block"] = 0
        params["model"]["modeling_layer"]["num_convs_per_block"] = 0
        self.model = Model.from_params(vocab=vocab, params=params["model"])

        self.ensure_batch_predictions_are_consistent()

        # Restore the state.
        self.model = saved_model
        self.instances = saved_instances
Example #26
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])

    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)

        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]

        predictions.extend(batch_labels)


    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
Example #27
    def from_archive(
        cls,
        archive: Archive,
        predictor_name: str = None,
        dataset_reader_to_load: str = "validation",
    ) -> "Predictor":
        """
        Instantiate a :class:`Predictor` from an :class:`~allennlp.models.archival.Archive`;
        that is, from the result of training a model. Optionally specify which `Predictor`
        subclass; otherwise, we try to find a corresponding predictor in `DEFAULT_PREDICTORS`, or if
        one is not found, the base class (i.e. :class:`Predictor`) will be used. Optionally specify
        which :class:`DatasetReader` should be loaded; otherwise, the validation one will be used
        if it exists followed by the training dataset reader.
        """
        # Duplicate the config so that the config inside the archive doesn't get consumed
        config = archive.config.duplicate()

        if not predictor_name:
            model_type = config.get("model").get("type")
            if model_type in DEFAULT_PREDICTORS:
                predictor_name = DEFAULT_PREDICTORS[model_type]
        predictor_class: Type[Predictor] = Predictor.by_name(  # type: ignore
            predictor_name
        ) if predictor_name is not None else cls

        if dataset_reader_to_load == "validation" and "validation_dataset_reader" in config:
            dataset_reader_params = config["validation_dataset_reader"]
        else:
            dataset_reader_params = config["dataset_reader"]
        dataset_reader = DatasetReader.from_params(dataset_reader_params)

        model = archive.model
        model.eval()

        return predictor_class(model, dataset_reader)
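A sketch of forcing the training-time reader instead of the validation one; the archive path is hypothetical:

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

archive = load_archive("model.tar.gz")  # hypothetical path
# Any value other than "validation" falls through to the training dataset_reader.
predictor = Predictor.from_archive(archive, dataset_reader_to_load="train")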
Example #28
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(str(dataset_file)))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #29
 def __init__(self) -> None:
     super().__init__(lazy=True)
     self.reader = DatasetReader.from_params(
         Params({
             "type": "sequence_tagging",
             "lazy": True
         }))
Example #30
def get_reader():
    params = {
        "type": "kg_probe",
        "tokenizer_and_candidate_generator": {
            "type": "bert_tokenizer_and_candidate_generator",
            "entity_candidate_generators": {
                "wordnet": {"type": "wordnet_mention_generator",
                            "entity_file": "tests/fixtures/wordnet/entities_fixture.jsonl"}
            },
            "entity_indexers":  {
                "wordnet": {
                       "type": "characters_tokenizer",
                       "tokenizer": {
                           "type": "word",
                           "word_splitter": {"type": "just_spaces"},
                       },
                       "namespace": "entity"
                    }
            },
            "bert_model_type": "tests/fixtures/bert/vocab.txt",
            "do_lower_case": True,
        },
    }

    return DatasetReader.from_params(Params(params))
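A brief sketch of exercising the reader built above; the data path is hypothetical:

reader = get_reader()
instances = reader.read("tests/fixtures/kg_probe/sentences.txt")  # hypothetical fixture
print("Read {} instances".format(len(list(instances))))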
Example #31
 def __init__(self) -> None:
     super().__init__(lazy=True)
     self.reader = DatasetReader.from_params(
         Params({
             'type': 'sequence_tagging',
             'lazy': True
         }))
Example #32
    def set_up_model(self, param_file, dataset_file):

        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params["dataset_reader"])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = reader.read(str(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(params=vocab_params,
                                           instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        self.model = Model.from_params(vocab=self.vocab,
                                       params=params["model"])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
Example #33
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #34
def main(serialization_directory, device):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    """

    config = Params.from_file(os.path.join(serialization_directory, "config.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(model.vocab)

    model_predictions = []
    batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False)
    for batch in Tqdm.tqdm(batches):
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None

        gold_tags = fields["tags"].labels
        sentence = fields["tokens"].tokens

        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
Example #35
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Example #36
    def from_archive(cls, archive: Archive, predictor_name: str = None) -> 'Predictor':
        """
        Instantiate a :class:`Predictor` from an :class:`~allennlp.models.archival.Archive`;
        that is, from the result of training a model. Optionally specify which `Predictor`
        subclass; otherwise, the default one for the model will be used.
        """
        # Duplicate the config so that the config inside the archive doesn't get consumed
        config = archive.config.duplicate()

        if not predictor_name:
            model_type = config.get("model").get("type")
            if model_type not in DEFAULT_PREDICTORS:
                raise ConfigurationError(f"No default predictor for model type {model_type}.\n"\
                                         f"Please specify a predictor explicitly.")
            predictor_name = DEFAULT_PREDICTORS[model_type]

        dataset_reader_params = config["dataset_reader"]
        dataset_reader = DatasetReader.from_params(dataset_reader_params)

        model = archive.model
        model.eval()

        return Predictor.by_name(predictor_name)(model, dataset_reader)
Example #37
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = [x.text for x in fields["tokens"].tokens]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
Example #38
 def __init__(self) -> None:
     super().__init__(lazy=True)
     self.reader = DatasetReader.from_params(Params({'type': 'sequence_tagging'}))
Example #39
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
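This helper is typically invoked from a ModelTestCase subclass; a minimal sketch with hypothetical fixture paths:

from allennlp.common.testing import ModelTestCase

class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model("tests/fixtures/my_model/experiment.json",
                          "tests/fixtures/data/my_data.json")

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)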
Example #40
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check gradients are None for non-trainable parameters and check that
        # trainable parameters receive some gradient if they are trainable.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model
Example #41
if load_pretrained_BiDAF:
    archive = load_archive("https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz")
    
    # Get the model and the config file
    model = archive.model
    config = archive.config.duplicate()
    
    keys_config = list(config.keys())
    print ("Key config list: ", keys_config)
    for key in keys_config:
        print ("Params of %s"%(key))
        print (config[key])
    ### Get the elements
    ## Data Readers ##
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    ## Vocabulary ##
    vocab = model.vocab 

    """
    ############  Propagate an instance text #############
    """
    instance = dataset_reader.text_to_instance("What kind of test succeeded on its first attempt?", 
                                               "One time I was writing a unit test, and it succeeded on the first attempt.", 
                                               char_spans=[(6, 10)])
    
    print ("Keys instance: ", instance.fields.keys())
    
    # Batch intances and convert to index using the vocabulary.
    instances = [instance]
    dataset = Batch(instances)