def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "ag_news_corpus.jsonl"
        )
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["Memphis", "Rout", "Still", "Stings", "for"], "label": "2"}
        instance2 = {"tokens": ["AP", "-", "Eli", "Manning", "has"], "label": "2"}
        instance3 = {"tokens": ["A", "conference", "dedicated", "to", "online"], "label": "4"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
    def test_set_skip_indexing_true(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "integer_labels.jsonl"
        )
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["This", "text", "has", "label", "0"], "label": 0}
        instance2 = {"tokens": ["This", "text", "has", "label", "1"], "label": 1}

        assert len(instances) == 2
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]

        with pytest.raises(ValueError) as exec_info:
            ag_path = (
                AllenNlpTestCase.FIXTURES_ROOT
                / "data"
                / "text_classification_json"
                / "imdb_corpus.jsonl"
            )
            ensure_list(reader.read(ag_path))
        assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
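The reader expects one JSON object per line with "text" and "label" keys. Below is a minimal, self-contained sketch of the skip_label_indexing behaviour exercised above; the temporary file and its rows are illustrative stand-ins for the integer_labels.jsonl fixture, not its actual contents.

import json
import tempfile

from allennlp.data.dataset_readers import TextClassificationJsonReader

# Illustrative rows mirroring an integer-label fixture: labels are already ints,
# so the reader stores them directly instead of indexing them through the vocab.
rows = [
    {"text": "This text has label 0", "label": 0},
    {"text": "This text has label 1", "label": 1},
]
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as data_file:
    for row in rows:
        data_file.write(json.dumps(row) + "\n")

reader = TextClassificationJsonReader(skip_label_indexing=True)
for instance in reader.read(data_file.name):
    print(instance.fields["label"].label)  # prints 0, then 1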
Example #3
    def test_cached_max_instances(self, lazy):
        data_file = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "imdb_corpus.jsonl"
        )

        # If we try reading with max instances, it shouldn't write to the cache.
        reader = TextClassificationJsonReader(
            cache_directory=self.cache_directory, lazy=lazy, max_instances=2
        )
        instances = list(reader.read(data_file))
        assert len(instances) == 2

        cache_file = reader._get_cache_location_for_file_path(str(data_file))
        assert not os.path.exists(cache_file)

        # Now reading again with no max_instances specified should create the cache.
        reader = TextClassificationJsonReader(cache_directory=self.cache_directory, lazy=lazy)
        instances = list(reader.read(data_file))
        assert len(instances) > 2
        assert os.path.exists(cache_file)

        # Reading again with max_instances=2 should return only two instances, even though they come from the cache.
        reader = TextClassificationJsonReader(
            cache_directory=self.cache_directory, max_instances=2, lazy=lazy
        )
        instances = list(reader.read(data_file))
        assert len(instances) == 2
Example #4
 def test_max_instances(self, lazy):
     data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                  "text_classification_json" / "imdb_corpus.jsonl")
     reader = TextClassificationJsonReader(max_instances=2, lazy=lazy)
     instances = reader.read(data_file)
     instance_count = sum(1 for _ in instances)
     assert instance_count == 2
    def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5)
        ag_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl"
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {
            "tokens": ['Memphis', 'Rout', 'Still', 'Stings', 'for'],
            "label": "2"
        }
        instance2 = {
            "tokens": ['AP', '-', 'Eli', 'Manning', 'has'],
            "label": "2"
        }
        instance3 = {
            "tokens": ['A', 'conference', 'dedicated', 'to', 'online'],
            "label": "4"
        }

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
Example #6
    def test_read_from_file_ag_news_corpus(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy)
        ag_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl"
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ['Memphis', 'Rout', 'Still', 'Stings', 'for', 'No', '.', '14',
                                'Louisville', ';', 'Coach', 'Petrino', 'Vows', 'to', 'Have',
                                'Team', 'Better', 'Prepared', '.', 'NASHVILLE', ',', 'Tenn.',
                                'Nov', '3', ',', '2004', '-', 'Louisville', '#', '39;s', '30-point',
                                'loss', 'at', 'home', 'to', 'Memphis', 'last', 'season', 'is', 'still',
                                'a', 'painful', 'memory', 'for', 'the', 'Cardinals', '.'],
                     "label": "2"}
        instance2 = {"tokens": ['AP', '-', 'Eli', 'Manning', 'has', 'replaced', 'Kurt', 'Warner',
                                'as', 'the', 'New', 'York', 'Giants', "'", 'starting',
                                'quarterback', '.'],
                     "label": "2"}
        instance3 = {"tokens": ['A', 'conference', 'dedicated', 'to', 'online', 'journalism',
                                'explores', 'the', 'effect', 'blogs', 'have', 'on', 'news',
                                'reporting', '.', 'Some', 'say', 'they', 'draw', 'attention',
                                'to', 'under', '-', 'reported', 'stories', '.', 'Others',
                                'struggle', 'to', 'establish', 'the', 'credibility',
                                'enjoyed', 'by', 'professionals', '.'],
                     "label": "4"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
Example #7
 def test_max_instances_with_multi_process_loader(self, num_workers):
     data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                  "text_classification_json" / "imdb_corpus.jsonl")
     reader = TextClassificationJsonReader(max_instances=2, lazy=True)
     instances = list(
         PyTorchDataLoader(reader.read(data_file),
                           collate_fn=lambda b: b[0],
                           num_workers=num_workers))
     assert len(instances) == 2
Example #8
 def test_read_creates_cache_file_when_not_present(self):
     data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                  "text_classification_json" / "imdb_corpus.jsonl")
     reader = TextClassificationJsonReader(
         cache_directory=self.cache_directory)
     cache_file = reader._get_cache_location_for_file_path(str(data_file))
     assert not os.path.exists(cache_file)
     reader.read(data_file)
     assert os.path.exists(cache_file)
Example #9
    def test_read_only_creates_cache_file_once(self):
        data_file = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "imdb_corpus.jsonl"
        )
        reader = TextClassificationJsonReader(cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(str(data_file))

        # The first read will create the cache.
        reader.read(data_file)
        assert os.path.exists(cache_file)
        with open(cache_file, "r") as in_file:
            cache_contents = in_file.read()
        # The second and all subsequent reads should _use_ the cache, not modify it.  I looked
        # into checking file modification times, but this test will probably be faster than the
        # granularity of `os.path.getmtime()` (which only returns values in seconds).
        reader.read(data_file)
        reader.read(data_file)
        reader.read(data_file)
        reader.read(data_file)
        with open(cache_file, "r") as in_file:
            final_cache_contents = in_file.read()
        assert cache_contents == final_cache_contents
Example #10
    def test_lazy_dataset_can_be_iterated_through_multiple_times(self):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        reader = TextClassificationJsonReader(lazy=True)
        instances = reader.read(data_file)
        assert isinstance(instances, AllennlpLazyDataset)

        first_pass_instances = list(instances)
        assert len(first_pass_instances) > 2
        second_pass_instances = list(instances)
        assert first_pass_instances == second_pass_instances
def prepare_data():
    reader = TextClassificationJsonReader(
        token_indexers={"tokens": SingleIdTokenIndexer()},
        tokenizer=WhitespaceTokenizer(),
    )
    train_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/train.jsonl")  # NOQA
    valid_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/dev.jsonl")  # NOQA
    vocab = Vocabulary.from_instances(train_dataset)
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    return train_dataset, valid_dataset, vocab
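A short sketch of how prepare_data() might be consumed; the batch size and the PyTorchDataLoader import path (AllenNLP 1.x) are assumptions rather than part of the original example.

from allennlp.data.dataloader import PyTorchDataLoader  # assumed AllenNLP 1.x API

train_dataset, valid_dataset, vocab = prepare_data()
# The datasets were already indexed with the vocabulary inside prepare_data(),
# so they can be batched directly.
train_loader = PyTorchDataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = PyTorchDataLoader(valid_dataset, batch_size=32)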
Example #12
    def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
            self, max_sequence_length):
        reader = TextClassificationJsonReader(
            segment_sentences=True, max_sequence_length=max_sequence_length)
        ag_path = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                   "text_classification_json" / "ag_news_corpus.jsonl")
        instances = list(reader.read(ag_path))

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm",
                                          parse=False,
                                          ner=False)

        text1 = ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
                 "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
                 "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
                 "at home to Memphis last season is still a painful memory "
                 "for the Cardinals.")
        instance1 = {"text": text1, "label": "2"}
        text2 = ("AP - Eli Manning has replaced Kurt Warner as the New York"
                 " Giants' starting quarterback.")
        instance2 = {"text": text2, "label": "2"}
        text3 = ("A conference dedicated to online journalism explores the "
                 "effect blogs have on news reporting. Some say they draw "
                 "attention to under-reported stories. Others struggle to "
                 "establish the credibility enjoyed by professionals.")
        instance3 = {"text": text3, "label": "4"}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance["text"])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
Example #13
    def test_caching_skipped_with_distributed_training(self, caplog,
                                                       monkeypatch, lazy):
        monkeypatch.setattr(common_util, "is_distributed", lambda: True)
        monkeypatch.setattr(dist, "get_rank", lambda: 0)
        monkeypatch.setattr(dist, "get_world_size", lambda: 1)

        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        reader = TextClassificationJsonReader(
            lazy=lazy, cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(str(data_file))

        deque(reader.read(data_file), maxlen=1)
        assert not os.path.exists(cache_file)
        assert "Can't cache data instances when there are multiple processes" in caplog.text
Example #14
    def test_read_uses_existing_cache_file_when_present(self):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        snli_copy_file = str(data_file) + ".copy"
        shutil.copyfile(data_file, snli_copy_file)
        reader = TextClassificationJsonReader(
            cache_directory=self.cache_directory)

        # The first read will create the cache.
        instances = reader.read(snli_copy_file)
        # Now we _remove_ the data file, to be sure we're reading from the cache.
        os.remove(snli_copy_file)
        cached_instances = reader.read(snli_copy_file)
        # We should get the same instances both times.
        assert len(instances) == len(cached_instances)
        for instance, cached_instance in zip(instances, cached_instances):
            assert instance.fields == cached_instance.fields
Example #15
    def test_caching_skipped_when_lock_not_acquired(self, caplog, lazy: bool):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        reader = TextClassificationJsonReader(
            lazy=lazy, cache_directory=self.cache_directory)
        reader.CACHE_FILE_LOCK_TIMEOUT = 1
        cache_file = reader._get_cache_location_for_file_path(str(data_file))

        with FileLock(cache_file + ".lock"):
            # Right now we hold the lock on the cache, so the reader shouldn't
            # be able to write to it. It will wait for 1 second (because that's what
            # we set the timeout to be), and then just read the instances as normal.
            caplog.clear()
            instances = list(reader.read(data_file))
            assert "Failed to acquire lock" in caplog.text
            assert instances

        # We didn't write to the cache because we couldn't acquire the file lock.
        assert not os.path.exists(cache_file)

        # Now we'll write to the cache and then try the same thing again, this
        # time making sure that we can still successfully read without the cache
        # when the lock can't be acquired.
        deque(reader.read(data_file), maxlen=1)
        assert os.path.exists(cache_file)

        with FileLock(cache_file + ".lock"):
            # Right now we hold the lock on the cache, so the reader shouldn't
            # be able to write to it. It will wait for 1 second (because that's what
            # we set the timeout to be), and then just read the instances as normal.
            caplog.clear()
            instances = list(reader.read(data_file))
            assert "Failed to acquire lock" in caplog.text
            assert instances
Example #16
def test_preprocess_reader_text_to_instance() -> None:
    reader = PreprocessReader(
        TextClassificationJsonReader(),
        {"text": Lowercase()},
    )
    instance = reader.text_to_instance(text="THIS IS A TEST SENTENCE")
    text_field: TextField = instance["tokens"]
    desired_output = ["this", "is", "a", "test", "sentence"]
    assert [token.text for token in text_field.tokens] == desired_output
    def test_interpret_fails_when_embedding_layer_not_found(self):
        inputs = {"sentence": "It was the ending that I hated"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = TextClassifierPredictor(model, TextClassificationJsonReader())

        interpreter = SmoothGradient(predictor)
        with raises(RuntimeError):
            interpreter.saliency_interpret_from_json(inputs)
Example #18
    def test_caching_with_lazy_reader_in_multi_process_loader(self):
        data_file = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "imdb_corpus.jsonl"
        )
        reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory)
        deque(
            PyTorchDataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2),
            maxlen=0,
        )

        # We shouldn't write to the cache when the data is being loaded from multiple
        # processes.
        cache_file = reader._get_cache_location_for_file_path(str(data_file))
        assert not os.path.exists(cache_file)

        # But try again from the main process and we should see the cache file.
        instances = list(reader.read(data_file))
        assert instances
        assert os.path.exists(cache_file)

        # Reading again from a multi-process loader should read from the cache.
        new_instances = list(
            PyTorchDataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2)
        )
        assert len(instances) == len(new_instances)
Example #19
    def test_interpret_fails_when_embedding_layer_not_found(self):
        inputs = {"sentence": "I always write unit tests for my code."}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(
            vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = TextClassifierPredictor(model,
                                            TextClassificationJsonReader())

        hotflipper = Hotflip(predictor)
        with raises(RuntimeError):
            hotflipper.initialize()
Example #20
    def test_caching_works_with_lazy_reading(self, caplog, lazy: bool):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")
        snli_copy_file = str(data_file) + ".copy"
        shutil.copyfile(data_file, snli_copy_file)
        reader = TextClassificationJsonReader(
            lazy=lazy, cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_copy_file)

        # The call to read() will give us an _iterator_.  We'll iterate over it multiple times,
        # and the caching behavior should change as we go.
        assert not os.path.exists(cache_file)
        instances = reader.read(snli_copy_file)

        # The first iteration will create the cache
        first_pass_instances = []
        for instance in instances:
            first_pass_instances.append(instance)
        assert "Caching instances to temp file" in " ".join(
            [rec.message for rec in caplog.records])
        assert os.path.exists(cache_file)

        # Now we _remove_ the data file, to be sure we're reading from the cache.
        os.remove(snli_copy_file)
        caplog.clear()
        instances = reader.read(snli_copy_file)
        second_pass_instances = []
        for instance in instances:
            second_pass_instances.append(instance)
        assert "Reading instances from cache" in " ".join(
            [rec.message for rec in caplog.records])

        # We should get the same instances both times.
        assert len(first_pass_instances) == len(second_pass_instances)
        for instance, cached_instance in zip(first_pass_instances,
                                             second_pass_instances):
            assert instance.fields == cached_instance.fields

        # And just to be super paranoid, in case the second pass somehow bypassed the cache
        # because of a bug that's hard to detect, we'll read the
        # instances from the cache with a non-lazy iterator and make sure they're the same.
        reader = TextClassificationJsonReader(
            lazy=False, cache_directory=self.cache_directory)
        cached_instances = reader.read(snli_copy_file)
        assert len(first_pass_instances) == len(cached_instances)
        for instance, cached_instance in zip(first_pass_instances,
                                             cached_instances):
            assert instance.fields == cached_instance.fields
    def test_interpret_works_with_custom_embedding_layer(self):
        inputs = {"sentence": "It was the ending that I hated"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace([w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = FakePredictorForTestingInterpret(model, TextClassificationJsonReader())
        interpreter = SmoothGradient(predictor)

        interpretation = interpreter.saliency_interpret_from_json(inputs)

        assert interpretation is not None
        assert "instance_1" in interpretation
        assert "grad_input_1" in interpretation["instance_1"]
        grad_input_1 = interpretation["instance_1"]["grad_input_1"]
        assert len(grad_input_1) == 7  # 7 words in input
Example #22
def main(device, base_serialization_dir):
    storage = "sqlite:///" + os.path.join(base_serialization_dir, "optuna.db")
    study = load_study("optuna_allennlp", storage)
    best_trial = study.best_trial
    print(f"best_trial: {best_trial.number}")

    reader = TextClassificationJsonReader(
        token_indexers={"tokens": SingleIdTokenIndexer()},
        tokenizer=WhitespaceTokenizer(),
    )
    serialization_dir = os.path.join(base_serialization_dir, f"trial_{best_trial.number}")
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
    data = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/test.jsonl")
    data.index_with(vocab)

    hyperparams = best_trial.params
    hyperparams.pop("lr")
    model = create_model(vocab=vocab, **hyperparams)
    model.load_state_dict(torch.load(os.path.join(serialization_dir, "best.th")))

    if device >= 0:
        model.to(device)
    data_loader = DataLoader(data, batch_size=64, collate_fn=allennlp_collate)
    print(evaluate(model, data_loader, cuda_device=device))
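A possible entry point for the script above; the argument parser and its defaults are assumptions, not part of the original example.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=int, default=-1, help="CUDA device id, or -1 for CPU")
    parser.add_argument("--base-serialization-dir", default="result")
    args = parser.parse_args()
    main(args.device, args.base_serialization_dir)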
Example #23
    def test_interpret_works_with_custom_embedding_layer(self):
        inputs = {"sentence": "I always write unit tests for my code"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(
            vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = FakePredictorForTestingInterpret(
            model, TextClassificationJsonReader())

        hotflipper = Hotflip(predictor)
        hotflipper.initialize()
        attack = hotflipper.attack_from_json(inputs, "tokens", "grad_input_1")
        assert attack is not None
        assert "final" in attack
        assert "original" in attack
        assert "outputs" in attack
        assert len(attack["final"][0]) == len(
            attack["original"])  # hotflip replaces words without removing
Example #24
    def test_cached_max_instances(self, lazy):
        data_file = (AllenNlpTestCase.FIXTURES_ROOT / "data" /
                     "text_classification_json" / "imdb_corpus.jsonl")

        # The first read will create the cache if it's not there already.
        reader = TextClassificationJsonReader(
            cache_directory=self.cache_directory, lazy=lazy)
        instances = reader.read(data_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count > 2

        # The second read should only return two instances, even though it's from the cache.
        reader = TextClassificationJsonReader(
            cache_directory=self.cache_directory, max_instances=2, lazy=lazy)
        instances = reader.read(data_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count == 2
    def test_read_from_file_ag_news_corpus(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "ag_news_corpus.jsonl"
        )
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {
            "tokens": [
                "Memphis",
                "Rout",
                "Still",
                "Stings",
                "for",
                "No",
                ".",
                "14",
                "Louisville",
                ";",
                "Coach",
                "Petrino",
                "Vows",
                "to",
                "Have",
                "Team",
                "Better",
                "Prepared",
                ".",
                "NASHVILLE",
                ",",
                "Tenn.",
                "Nov",
                "3",
                ",",
                "2004",
                "-",
                "Louisville",
                "#",
                "39;s",
                "30-point",
                "loss",
                "at",
                "home",
                "to",
                "Memphis",
                "last",
                "season",
                "is",
                "still",
                "a",
                "painful",
                "memory",
                "for",
                "the",
                "Cardinals",
                ".",
            ],
            "label": "2",
        }
        instance2 = {
            "tokens": [
                "AP",
                "-",
                "Eli",
                "Manning",
                "has",
                "replaced",
                "Kurt",
                "Warner",
                "as",
                "the",
                "New",
                "York",
                "Giants",
                "'",
                "starting",
                "quarterback",
                ".",
            ],
            "label": "2",
        }
        instance3 = {
            "tokens": [
                "A",
                "conference",
                "dedicated",
                "to",
                "online",
                "journalism",
                "explores",
                "the",
                "effect",
                "blogs",
                "have",
                "on",
                "news",
                "reporting",
                ".",
                "Some",
                "say",
                "they",
                "draw",
                "attention",
                "to",
                "under",
                "-",
                "reported",
                "stories",
                ".",
                "Others",
                "struggle",
                "to",
                "establish",
                "the",
                "credibility",
                "enjoyed",
                "by",
                "professionals",
                ".",
            ],
            "label": "4",
        }

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
from allennlp.models import BasicClassifier
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training import GradientDescentTrainer

from lib.tokenizer import MecabTokenizer

# Fix the random seeds
random.seed(2)
torch.manual_seed(2)

# Instantiate the custom tokenizer
tokenizer = MecabTokenizer()

# Token indexer
token_indexer = SingleIdTokenIndexer()
reader = TextClassificationJsonReader(
    tokenizer=tokenizer, token_indexers=dict(tokens=token_indexer))
# Dataset reader
train_dataset = reader.read('data/amazon_reviews/amazon_reviews_train.jsonl')
validation_dataset = reader.read(
    "data/amazon_reviews/amazon_reviews_validation.jsonl")

# Build the vocabulary
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
# Configure the datasets to use the vocabulary built above when they are processed
train_dataset.index_with(vocab)
validation_dataset.index_with(vocab)

# Create the word embeddings
embedding = Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=100)
# Build the text feature vectors
text_embedder = BasicTextFieldEmbedder({"tokens": embedding})
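The excerpt stops after building the text field embedder. Below is a hedged sketch of how the remaining imports (BasicClassifier, AdamOptimizer, GradientDescentTrainer) are typically wired together; the bag-of-embeddings encoder, batch size, and epoch count are illustrative assumptions, not part of the original script.

from allennlp.data.dataloader import PyTorchDataLoader  # assumed AllenNLP 1.x API
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

# Pool the embedded tokens into one vector per example (assumed encoder choice).
encoder = BagOfEmbeddingsEncoder(embedding_dim=100, averaged=True)
model = BasicClassifier(vocab=vocab, text_field_embedder=text_embedder, seq2vec_encoder=encoder)

train_loader = PyTorchDataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = PyTorchDataLoader(validation_dataset, batch_size=32)

optimizer = AdamOptimizer(
    model_parameters=[(n, p) for n, p in model.named_parameters() if p.requires_grad]
)
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_loader,
    validation_data_loader=validation_loader,
    num_epochs=5,
)
trainer.train()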