Example #1
    def test_make_vocab_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary. At minimum, @@UNKNOWN@@
        # should be present in any namespace that may contain OOV entries during indexing.
        # The 'tokens' namespace will see new words, but it has an @@UNKNOWN@@ token;
        # the 'labels' namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        make_vocab_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
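
The comment above turns on which namespaces carry @@UNKNOWN@@. A minimal sketch of that behavior using the Vocabulary API directly (the token and label names here are made up; 'labels' matches the default non-padded namespaces, so it never receives @@UNKNOWN@@):

    from allennlp.data import Vocabulary

    vocab = Vocabulary()
    vocab.add_token_to_namespace('N', namespace='labels')

    # OOV words in 'tokens' fall back to @@UNKNOWN@@ ...
    unk_index = vocab.get_token_index('@@UNKNOWN@@', namespace='tokens')
    assert vocab.get_token_index('never_seen', namespace='tokens') == unk_index

    # ... but 'labels' has no @@UNKNOWN@@, so an unseen label raises KeyError.
    try:
        vocab.get_token_index('ADJ', namespace='labels')
    except KeyError:
        pass  # expected: 'labels' cannot index OOV entries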
Example #2
    def test_make_vocab_makes_vocab(self):
        vocab_path = self.TEST_DIR / 'vocabulary'

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = vocab_path

        make_vocab_from_params(self.params)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {
            'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'
        }

        with open(vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == [
            '.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs',
            'snakes'
        ]

        with open(vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
Example #3
def cache_vocab(params: Params, vocab_config_path: Optional[str] = None):
    """
    Caches the vocabulary given in the params to the filesystem. Useful for large
    datasets that are run repeatedly.

    :param params: the AllenNLP ``Params`` for training
    :param vocab_config_path: an optional config path for constructing the vocab;
        defaults to ``VOCAB_CONFIG_PATH`` from the surrounding module
    """
    if "vocabulary" not in params or "directory_path" not in params["vocabulary"]:
        return

    vocab_path = params["vocabulary"]["directory_path"]

    if os.path.exists(vocab_path):
        if os.listdir(vocab_path):
            # The vocabulary is already cached; nothing to do.
            return

        # Remove empty vocabulary directory to make AllenNLP happy
        try:
            os.rmdir(vocab_path)
        except OSError:
            pass

    vocab_config_path = vocab_config_path if vocab_config_path else VOCAB_CONFIG_PATH

    params = merge_configs([params, Params.from_file(vocab_config_path)])
    params["vocabulary"].pop("directory_path", None)
    make_vocab_from_params(params, os.path.split(vocab_path)[0])
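
A possible call site for cache_vocab, assuming the surrounding module defines VOCAB_CONFIG_PATH and merge_configs as used above (the config path below is purely illustrative):

    # Sketch: warm the vocabulary cache once up front; repeated calls return
    # immediately because the vocabulary directory is then non-empty.
    params = Params.from_file('config/train.json')  # hypothetical config file
    cache_vocab(params)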
Example #4
    def test_make_vocab_doesnt_overwrite_vocab(self):
        vocab_path = self.TEST_DIR / 'vocabulary'
        os.mkdir(vocab_path)
        # Put something in the vocab directory.
        with open(vocab_path / "test.txt", "a+") as open_file:
            open_file.write("test")
        # It should raise an error if the vocab dir is non-empty.
        with pytest.raises(ConfigurationError):
            make_vocab_from_params(self.params, self.TEST_DIR)
Example #5
    def test_make_vocab_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        vocab = Vocabulary()
        vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
        vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["directory_path"] = existing_vocab_path
        self.params["vocabulary"]["extend"] = True
        self.params["vocabulary"]["min_count"] = {"tokens": 3}
        make_vocab_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {
            "labels.txt", "non_padded_namespaces.txt", "tokens.txt"
        }

        with open(extended_vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"

        tokens.sort()
        assert tokens == [
            ".",
            "@@UNKNOWN@@",
            "animals",
            "are",
            "some_weird_token_1",
            "some_weird_token_2",
        ]

        with open(extended_vocab_path / "labels.txt") as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ["N", "V"]
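
The three positional assertions above rely on extension appending to, never reordering, the pre-existing vocabulary. A minimal sketch of that invariant with the Vocabulary API (the token names are arbitrary):

    from allennlp.data import Vocabulary

    vocab = Vocabulary()
    old_index = vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("brand_new_token", namespace="tokens")

    # New tokens are appended; ids assigned earlier stay stable.
    assert vocab.get_token_index("some_weird_token_1", "tokens") == old_index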
Example #6
    def test_make_vocab_makes_vocab(self):
        vocab_path = self.TEST_DIR / 'vocabulary'

        make_vocab_from_params(self.params, self.TEST_DIR)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

        with open(vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
Example #7
    def test_make_vocab_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / u'existing'
        extended_serialization_dir = self.TEST_DIR / u'extended'
        existing_vocab_path = existing_serialization_dir / u'vocabulary'
        extended_vocab_path = extended_serialization_dir / u'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace(u'some_weird_token_1',
                                     namespace=u'tokens')
        vocab.add_token_to_namespace(u'some_weird_token_2',
                                     namespace=u'tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params[u'vocabulary'] = {}
        self.params[u'vocabulary'][u'directory_path'] = existing_vocab_path
        self.params[u'vocabulary'][u'extend'] = True
        self.params[u'vocabulary'][u'min_count'] = {u"tokens": 3}
        make_vocab_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == set(
            [u'labels.txt', u'non_padded_namespaces.txt', u'tokens.txt'])

        with open(extended_vocab_path / u'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == u'@@UNKNOWN@@'
        assert tokens[1] == u'some_weird_token_1'
        assert tokens[2] == u'some_weird_token_2'

        tokens.sort()
        assert tokens == [
            u'.', u'@@UNKNOWN@@', u'animals', u'are', u'some_weird_token_1',
            u'some_weird_token_2'
        ]

        with open(extended_vocab_path / u'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == [u'N', u'V']
Example #8
    def test_make_vocab_makes_vocab(self):
        vocab_path = os.path.join(self.TEST_DIR, 'vocabulary')

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = vocab_path

        make_vocab_from_params(self.params)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(os.path.join(vocab_path, 'tokens.txt')) as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

        with open(os.path.join(vocab_path, 'labels.txt')) as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
Example #9
    def test_make_vocab_makes_vocab_with_config(self):
        vocab_path = self.TEST_DIR / 'vocabulary'

        self.params['vocabulary'] = {}
        self.params['vocabulary']['min_count'] = {"tokens": 3}

        make_vocab_from_params(self.params, self.TEST_DIR)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are']

        with open(vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
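
min_count only filters the namespaces it names, dropping tokens seen fewer times than the threshold. A small sketch of the same behavior built from raw counts (the counts here are made up):

    from collections import Counter

    from allennlp.data import Vocabulary

    counter = {'tokens': Counter({'are': 3, 'animals': 3, '.': 3, 'birds': 1})}
    vocab = Vocabulary(counter=counter, min_count={'tokens': 3})

    kept = set(vocab.get_token_to_index_vocabulary('tokens'))
    assert 'birds' not in kept  # below the threshold, filtered out
    assert 'animals' in kept    # meets the threshold, kept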
Example #10
    def test_make_vocab_makes_vocab(self):
        vocab_path = self.TEST_DIR / u'vocabulary'

        make_vocab_from_params(self.params, self.TEST_DIR)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == set(
            [u'labels.txt', u'non_padded_namespaces.txt', u'tokens.txt'])

        with open(vocab_path / u'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == [
            u'.', u'@@UNKNOWN@@', u'animals', u'are', u'birds', u'cats',
            u'dogs', u'snakes'
        ]

        with open(vocab_path / u'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == [u'N', u'V']
Example #11
    def test_make_vocab_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = True
        self.params['vocabulary']['min_count'] = {"tokens": 3}
        make_vocab_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                          'some_weird_token_1', 'some_weird_token_2']

        with open(extended_vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
Example #12
    def test_make_vocab_makes_vocab_with_config(self):
        vocab_path = self.TEST_DIR / "vocabulary"

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["min_count"] = {"tokens": 3}

        make_vocab_from_params(self.params, self.TEST_DIR)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {
            "labels.txt", "non_padded_namespaces.txt", "tokens.txt"
        }

        with open(vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == [".", "@@UNKNOWN@@", "animals", "are"]

        with open(vocab_path / "labels.txt") as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ["N", "V"]
Example #13
    def test_make_vocab_makes_vocab(self):
        vocab_path = self.TEST_DIR / "vocabulary"

        make_vocab_from_params(self.params, self.TEST_DIR)

        vocab_files = os.listdir(vocab_path)
        assert set(vocab_files) == {
            "labels.txt", "non_padded_namespaces.txt", "tokens.txt"
        }

        with open(vocab_path / "tokens.txt") as f:
            tokens = [line.strip() for line in f]

        tokens.sort()
        assert tokens == [
            ".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs",
            "snakes"
        ]

        with open(vocab_path / "labels.txt") as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ["N", "V"]
Example #14
    def test_make_vocab_fails_without_vocabulary_key(self):
        with pytest.raises(ConfigurationError):
            make_vocab_from_params(self.params)
Example #15
    def test_make_vocab_succeeds_without_vocabulary_key(self):
        make_vocab_from_params(self.params, self.TEST_DIR)
Example #16
import os

from allennlp.common.params import Params


def build_vocab():
    from allennlp.commands.make_vocab import make_vocab_from_params

    # `root` is assumed to be the project root, defined elsewhere in the module.
    jsonnet_file = os.path.join(root, 'configs/baseline_bert.jsonnet')
    params = Params.from_file(jsonnet_file)
    make_vocab_from_params(params, '/datadrive/bert_vocab')
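
As the tests above show, make_vocab_from_params writes into a vocabulary/ subdirectory of the serialization directory it is given, so this call produces /datadrive/bert_vocab/vocabulary. A later run can reuse it (sketch, using the same vocabulary keys as the tests above):

    # Sketch: point a subsequent training run at the cached vocabulary.
    params = Params.from_file(os.path.join(root, 'configs/baseline_bert.jsonnet'))
    params['vocabulary'] = {
        'directory_path': '/datadrive/bert_vocab/vocabulary',
        'extend': False,
    }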
Example #17
    "Specify a list of treebanks to use; leave blank to default to all treebanks available"
)
parser.add_argument("--params_file",
                    default=None,
                    type=str,
                    help="The path to the vocab params")
args = parser.parse_args()

import_submodules("udify")

params_file = util.VOCAB_CONFIG_PATH if not args.params_file else args.params_file

treebanks = sorted(
    util.get_ud_treebank_files(args.dataset_dir, args.treebanks).items())
for treebank, (train_file, dev_file, test_file) in treebanks:
    logger.info(f"Creating vocabulary for treebank {treebank}")

    if not train_file:
        logger.info(f"No training data for {treebank}, skipping")
        continue

    overrides = json.dumps({
        "train_data_path": train_file,
        "validation_data_path": dev_file,
        "test_data_path": test_file
    })
    params = Params.from_file(params_file, overrides)
    output_file = os.path.join(args.output_dir, treebank)

    make_vocab_from_params(params, output_file)