Example #1
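All of these snippets are methods of an AllenNLP test class, so `self.params` and `self.TEST_DIR` come from shared fixtures rather than from the snippets themselves. Below is a minimal sketch of the scaffolding they assume; the import paths, `Params` contents, and fixture path are illustrative and vary across the AllenNLP versions these examples were taken from.

import os

import pytest

from allennlp.commands.dry_run import dry_run_from_params  # module location varies by release
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.common.testing import AllenNlpTestCase
from allennlp.data import Vocabulary


class TestDryRun(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        # AllenNlpTestCase provides self.TEST_DIR, a scratch directory that is
        # recreated for every test. The training fixture is a tiny
        # sequence-tagging file whose tokens ('cats', 'dogs', 'snakes',
        # 'birds', 'animals', 'are', '.') and N/V labels match the
        # assertions in the tests below.
        self.params = Params({
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": "path/to/sequence_tagging_fixture.tsv",  # hypothetical path
            # ... plus any other keys your AllenNLP version requires here
        })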
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary. At a minimum, @@UNKNOWN@@
        # should be present in any namespace that may see OOV entries during indexing.
        # The `tokens` namespace will see new words, but it has an @@UNKNOWN@@ token;
        # the `labels` namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
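With `extend` set to `False`, the dry run serializes the pre-built vocabulary unchanged: `tokens.txt` contains exactly @@UNKNOWN@@ plus the two tokens added above, and nothing seen in the training data is appended.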
Example #2
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        vocab = Vocabulary()
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary. At a minimum, @@UNKNOWN@@
        # should be present in any namespace that may see OOV entries during indexing.
        # The `tokens` namespace will see new words, but it has an @@UNKNOWN@@ token;
        # the `labels` namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
        vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
        vocab.add_token_to_namespace("N", namespace="labels")
        vocab.add_token_to_namespace("V", namespace="labels")
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["type"] = "from_files"
        self.params["vocabulary"]["directory"] = existing_vocab_path
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"
        assert len(tokens) == 3
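This variant targets the newer AllenNLP configuration schema: the vocabulary is declared with `"type": "from_files"` and a `"directory"` key instead of `directory_path` with an explicit `extend` flag, since loading from files without extension is exactly what `from_files` does. The expected contents of `tokens.txt` are the same.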
Example #3
    def test_dry_run_doesnt_overwrite_vocab(self):
        vocab_path = self.TEST_DIR / "vocabulary"
        os.mkdir(vocab_path)
        # Put something in the vocab directory
        with open(vocab_path / "test.txt", "a+") as open_file:
            open_file.write("test")
        # It should raise an error if the vocab dir is non-empty
        with pytest.raises(ConfigurationError):
            dry_run_from_params(self.params, self.TEST_DIR)
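Here the non-empty directory sits at `<serialization_dir>/vocabulary`, the exact path the dry run would write to, so `dry_run_from_params` raises `ConfigurationError` rather than overwriting it.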
Example #4
    def test_dry_run_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        vocab = Vocabulary()
        vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
        vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["type"] = "extend"
        self.params["vocabulary"]["directory"] = existing_vocab_path
        self.params["vocabulary"]["min_count"] = {"tokens": 3}
        dry_run_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {
            "labels.txt", "non_padded_namespaces.txt", "tokens.txt"
        }

        with open(extended_vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"

        tokens.sort()
        assert tokens == [
            ".",
            "@@UNKNOWN@@",
            "animals",
            "are",
            "some_weird_token_1",
            "some_weird_token_2",
        ]

        with open(extended_vocab_path / "labels.txt") as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ["N", "V"]
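Extension keeps the existing entries at their original indices (the first three lines of `tokens.txt` are unchanged) and appends only dataset tokens that satisfy `min_count`: with `{"tokens": 3}`, only 'animals', 'are', and '.' occur at least three times in the training fixture, so 'birds', 'cats', 'dogs', and 'snakes' are filtered out (compare the next example, which is built without `min_count`). The `labels` namespace has no `min_count` entry, so both labels survive.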
Example #5
    def test_dry_run_makes_vocab(self):
        vocab_path = self.TEST_DIR / "vocabulary"

        dry_run_from_params(self.params, self.TEST_DIR)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {"labels.txt", "non_padded_namespaces.txt", "tokens.txt"}

        with open(vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == [".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes"]

        with open(vocab_path / "labels.txt") as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ["N", "V"]
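With no `vocabulary` key in the params at all, the dry run builds the vocabulary directly from the training data: every token from the fixture appears, plus @@UNKNOWN@@ for the padded `tokens` namespace.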
Example #6
    def test_dry_run_doesnt_overwrite_vocab(self):
        vocab_path = self.TEST_DIR / 'pre-defined-vocab'
        os.mkdir(vocab_path)
        # Put something in the vocab directory
        with open(vocab_path / "test.txt", "a+") as open_file:
            open_file.write("test")

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = vocab_path

        dry_run_from_params(self.params, self.TEST_DIR)

        # Shouldn't have been overwritten.
        predefined_vocab_files = os.listdir(vocab_path)
        assert set(predefined_vocab_files) == {'test.txt'}
        # But we should have written the created vocab to serialization_dir/vocabulary:
        new_vocab_files = os.listdir(self.TEST_DIR / 'vocabulary')
        assert set(new_vocab_files) == {
            'tokens.txt', 'non_padded_namespaces.txt', 'labels.txt'
        }
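The pre-existing directory referenced by `directory_path` is left untouched; the vocabulary the dry run produces is serialized to `<serialization_dir>/vocabulary` instead.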
Example #7
    def test_dry_run_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = True
        self.params['vocabulary']['min_count'] = {"tokens": 3}
        dry_run_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                          'some_weird_token_1', 'some_weird_token_2']

        with open(extended_vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
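This is the older spelling of the extension config shown in Example #4: `directory_path` plus an explicit `extend` flag rather than `"type": "extend"` with `"directory"`. The resulting vocabulary files are identical.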
Example #8
    def test_dry_run_without_vocabulary_key(self):
        dry_run_from_params(self.params, self.TEST_DIR)
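Omitting the `vocabulary` key entirely is also fine: the dry run falls back to building a vocabulary from the datasets, as in Example #5.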