def test_make_vocab_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    # If `extend` is False, it is the user's responsibility to make sure that dataset
    # instances will be indexable by the provided vocabulary. At a minimum, @@UNKNOWN@@
    # should be present in every namespace that could see OOV entries during indexing.
    # New words will be seen in the `tokens` namespace, but it has an @@UNKNOWN@@ token.
    # The `labels` namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    make_vocab_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def test_make_vocab_makes_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = vocab_path
    make_vocab_from_params(self.params)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def cache_vocab(params: Params, vocab_config_path: str = None):
    """
    Caches the vocabulary given in the Params to the filesystem. Useful for large
    datasets that are run repeatedly.
    :param params: the AllenNLP Params
    :param vocab_config_path: an optional config path for constructing the vocab
    """
    if "vocabulary" not in params or "directory_path" not in params["vocabulary"]:
        return

    vocab_path = params["vocabulary"]["directory_path"]

    if os.path.exists(vocab_path):
        # A non-empty directory means the vocab is already cached; nothing to do.
        if os.listdir(vocab_path):
            return

        # Remove the empty vocabulary directory to make AllenNLP happy
        try:
            os.rmdir(vocab_path)
        except OSError:
            pass

    vocab_config_path = vocab_config_path if vocab_config_path else VOCAB_CONFIG_PATH

    params = merge_configs([params, Params.from_file(vocab_config_path)])
    params["vocabulary"].pop("directory_path", None)
    # make_vocab_from_params appends "vocabulary" itself, so pass the parent directory.
    make_vocab_from_params(params, os.path.split(vocab_path)[0])
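# Usage sketch for cache_vocab (illustrative, not part of the original module).
# The config path below is a hypothetical example; params is assumed to contain
# a "vocabulary" section with a "directory_path" entry.
params = Params.from_file("configs/experiment.jsonnet")  # hypothetical path
cache_vocab(params)  # builds and caches the vocab; later calls return early once the directory is non-empty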
def test_make_vocab_doesnt_overwrite_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'
    os.mkdir(vocab_path)

    # Put something in the vocab directory
    with open(vocab_path / "test.txt", "a+") as open_file:
        open_file.write("test")

    # It should raise a ConfigurationError if the vocab directory is non-empty
    with pytest.raises(ConfigurationError):
        make_vocab_from_params(self.params, self.TEST_DIR)
def test_make_vocab_with_extension(self): existing_serialization_dir = self.TEST_DIR / "existing" extended_serialization_dir = self.TEST_DIR / "extended" existing_vocab_path = existing_serialization_dir / "vocabulary" extended_vocab_path = extended_serialization_dir / "vocabulary" vocab = Vocabulary() vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens") vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens") os.makedirs(existing_serialization_dir, exist_ok=True) vocab.save_to_files(existing_vocab_path) self.params["vocabulary"] = {} self.params["vocabulary"]["directory_path"] = existing_vocab_path self.params["vocabulary"]["extend"] = True self.params["vocabulary"]["min_count"] = {"tokens": 3} make_vocab_from_params(self.params, extended_serialization_dir) vocab_files = os.listdir(extended_vocab_path) assert set(vocab_files) == { "labels.txt", "non_padded_namespaces.txt", "tokens.txt" } with open(extended_vocab_path / "tokens.txt") as f: tokens = [line.strip() for line in f] assert tokens[0] == "@@UNKNOWN@@" assert tokens[1] == "some_weird_token_1" assert tokens[2] == "some_weird_token_2" tokens.sort() assert tokens == [ ".", "@@UNKNOWN@@", "animals", "are", "some_weird_token_1", "some_weird_token_2", ] with open(extended_vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] labels.sort() assert labels == ["N", "V"]
def test_make_vocab_makes_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    make_vocab_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    make_vocab_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab(self):
    vocab_path = os.path.join(self.TEST_DIR, 'vocabulary')

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = vocab_path
    make_vocab_from_params(self.params)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(os.path.join(vocab_path, 'tokens.txt')) as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(os.path.join(vocab_path, 'labels.txt')) as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab_with_config(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    self.params['vocabulary'] = {}
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    make_vocab_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    make_vocab_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    make_vocab_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab_with_config(self): vocab_path = self.TEST_DIR / "vocabulary" self.params["vocabulary"] = {} self.params["vocabulary"]["min_count"] = {"tokens": 3} make_vocab_from_params(self.params, self.TEST_DIR) vocab_files = os.listdir(vocab_path) assert set(vocab_files) == { "labels.txt", "non_padded_namespaces.txt", "tokens.txt" } with open(vocab_path / "tokens.txt") as f: tokens = [line.strip() for line in f] tokens.sort() assert tokens == [".", "@@UNKNOWN@@", "animals", "are"] with open(vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] labels.sort() assert labels == ["N", "V"]
def test_make_vocab_makes_vocab(self): vocab_path = self.TEST_DIR / "vocabulary" make_vocab_from_params(self.params, self.TEST_DIR) vocab_files = os.listdir(vocab_path) assert set(vocab_files) == { "labels.txt", "non_padded_namespaces.txt", "tokens.txt" } with open(vocab_path / "tokens.txt") as f: tokens = [line.strip() for line in f] tokens.sort() assert tokens == [ ".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes" ] with open(vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] labels.sort() assert labels == ["N", "V"]
def test_make_vocab_fails_without_vocabulary_key(self):
    with pytest.raises(ConfigurationError):
        make_vocab_from_params(self.params)
def test_make_vocab_succeeds_without_vocabulary_key(self):
    make_vocab_from_params(self.params, self.TEST_DIR)
def build_vocab():
    from allennlp.commands.make_vocab import make_vocab_from_params

    jsonnet_file = os.path.join(root, 'configs/baseline_bert.jsonnet')
    params = Params.from_file(jsonnet_file)
    make_vocab_from_params(params, '/datadrive/bert_vocab')
"Specify a list of treebanks to use; leave blank to default to all treebanks available" ) parser.add_argument("--params_file", default=None, type=str, help="The path to the vocab params") args = parser.parse_args() import_submodules("udify") params_file = util.VOCAB_CONFIG_PATH if not args.params_file else args.params_file treebanks = sorted( util.get_ud_treebank_files(args.dataset_dir, args.treebanks).items()) for treebank, (train_file, dev_file, test_file) in treebanks: logger.info(f"Creating vocabulary for treebank {treebank}") if not train_file: logger.info(f"No training data for {treebank}, skipping") continue overrides = json.dumps({ "train_data_path": train_file, "validation_data_path": dev_file, "test_data_path": test_file }) params = Params.from_file(params_file, overrides) output_file = os.path.join(args.output_dir, treebank) make_vocab_from_params(params, output_file)