def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    # If `extend` is False, it is the user's responsibility to make sure that the dataset
    # instances will be indexable by the provided vocabulary. At least @@UNKNOWN@@ should be
    # present in every namespace for which there could be OOV entries in the dataset during
    # indexing. For the `tokens` namespace, new words will be seen, but `tokens` has the
    # @@UNKNOWN@@ token; for the `labels` namespace there is no @@UNKNOWN@@, so 'N' and 'V'
    # must be added upfront.
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / "existing"
    extended_serialization_dir = self.TEST_DIR / "extended"
    existing_vocab_path = existing_serialization_dir / "vocabulary"
    extended_vocab_path = extended_serialization_dir / "vocabulary"

    vocab = Vocabulary()
    # When the vocabulary is loaded from files without extension, it is the user's
    # responsibility to make sure that the dataset instances will be indexable by the
    # provided vocabulary. At least @@UNKNOWN@@ should be present in every namespace
    # for which there could be OOV entries in the dataset during indexing. For the
    # `tokens` namespace, new words will be seen, but `tokens` has the @@UNKNOWN@@
    # token; for the `labels` namespace there is no @@UNKNOWN@@, so 'N' and 'V' must
    # be added upfront.
    vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
    vocab.add_token_to_namespace("N", namespace="labels")
    vocab.add_token_to_namespace("V", namespace="labels")
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params["vocabulary"] = {}
    self.params["vocabulary"]["type"] = "from_files"
    self.params["vocabulary"]["directory"] = existing_vocab_path
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == "@@UNKNOWN@@"
    assert tokens[1] == "some_weird_token_1"
    assert tokens[2] == "some_weird_token_2"
    assert len(tokens) == 3
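# The tests in this module rely on a ``self.params`` fixture, presumably built in
# ``setUp``. Below is a minimal sketch of such a fixture; the reader type, model
# sizes, and data path are illustrative assumptions, not the project's actual values.
from allennlp.common import Params

def make_dry_run_params(fixtures_root):
    # Hypothetical helper mirroring what setUp would build for these tests.
    return Params({
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(fixtures_root / "data" / "sequence_tagging.tsv"),
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7},
        },
    })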
def test_dry_run_doesnt_overwrite_vocab(self):
    vocab_path = self.TEST_DIR / "vocabulary"
    os.mkdir(vocab_path)
    # Put something in the vocab directory.
    with open(vocab_path / "test.txt", "a+") as open_file:
        open_file.write("test")
    # It should raise an error if the vocab dir is non-empty.
    with pytest.raises(ConfigurationError):
        dry_run_from_params(self.params, self.TEST_DIR)
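# The guard exercised above, conceptually: refuse to write vocabulary files into a
# non-empty directory. A sketch of the check (not the library's exact code path):
from allennlp.common.checks import ConfigurationError

def assert_vocab_dir_is_empty(vocab_path):
    # Raise if the target directory already contains files, so an existing
    # vocabulary is never silently overwritten.
    if os.path.isdir(vocab_path) and os.listdir(vocab_path):
        raise ConfigurationError(f"vocabulary directory {vocab_path} is not empty")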
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / "existing"
    extended_serialization_dir = self.TEST_DIR / "extended"
    existing_vocab_path = existing_serialization_dir / "vocabulary"
    extended_vocab_path = extended_serialization_dir / "vocabulary"

    vocab = Vocabulary()
    vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params["vocabulary"] = {}
    self.params["vocabulary"]["type"] = "extend"
    self.params["vocabulary"]["directory"] = existing_vocab_path
    self.params["vocabulary"]["min_count"] = {"tokens": 3}
    dry_run_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {"labels.txt", "non_padded_namespaces.txt", "tokens.txt"}

    with open(extended_vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == "@@UNKNOWN@@"
    assert tokens[1] == "some_weird_token_1"
    assert tokens[2] == "some_weird_token_2"

    tokens.sort()
    assert tokens == [
        ".",
        "@@UNKNOWN@@",
        "animals",
        "are",
        "some_weird_token_1",
        "some_weird_token_2",
    ]

    with open(extended_vocab_path / "labels.txt") as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ["N", "V"]
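# ``min_count`` prunes infrequent dataset tokens before they are used to extend the
# vocabulary, which is why only the frequent tokens survive in the assertion above.
# A self-contained sketch of that pruning logic (not AllenNLP's internals); the exact
# corpus counts here are illustrative assumptions based on the fixture data:
from collections import Counter

def prune_by_min_count(token_counts, min_count):
    # Keep only tokens whose corpus frequency meets the threshold.
    return sorted(tok for tok, n in token_counts.items() if n >= min_count)

counts = Counter({"animals": 4, "are": 4, ".": 4,
                  "birds": 1, "cats": 1, "dogs": 1, "snakes": 1})
assert prune_by_min_count(counts, 3) == [".", "animals", "are"]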
def test_dry_run_makes_vocab(self):
    vocab_path = self.TEST_DIR / "vocabulary"

    dry_run_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {"labels.txt", "non_padded_namespaces.txt", "tokens.txt"}

    with open(vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    tokens.sort()
    assert tokens == [".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes"]

    with open(vocab_path / "labels.txt") as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ["N", "V"]
def test_dry_run_doesnt_overwrite_vocab(self):
    vocab_path = self.TEST_DIR / 'pre-defined-vocab'
    os.mkdir(vocab_path)
    # Put something in the vocab directory.
    with open(vocab_path / "test.txt", "a+") as open_file:
        open_file.write("test")

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = vocab_path
    dry_run_from_params(self.params, self.TEST_DIR)

    # The pre-defined vocabulary directory shouldn't have been overwritten.
    predefined_vocab_files = os.listdir(vocab_path)
    assert set(predefined_vocab_files) == {'test.txt'}

    # But the created vocab should have been written to serialization_dir/vocabulary.
    new_vocab_files = os.listdir(self.TEST_DIR / 'vocabulary')
    assert set(new_vocab_files) == {'tokens.txt', 'non_padded_namespaces.txt', 'labels.txt'}
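# The behavior above, conceptually: the pre-defined vocabulary directory is only read,
# and the vocabulary is written under the serialization directory, never back to its
# source. A runnable sketch with throwaway directories (an assumption-level
# illustration, not the test's actual paths):
import tempfile
from pathlib import Path
from allennlp.data import Vocabulary

source_dir = Path(tempfile.mkdtemp()) / "pre-defined-vocab"
target_dir = Path(tempfile.mkdtemp()) / "vocabulary"

v = Vocabulary()
v.add_token_to_namespace("N", namespace="labels")
v.save_to_files(source_dir)

Vocabulary.from_files(source_dir).save_to_files(target_dir)  # source_dir stays untouched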
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    dry_run_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ['N', 'V']
def test_dry_run_without_vocabulary_key(self):
    dry_run_from_params(self.params, self.TEST_DIR)
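# With no "vocabulary" key in the params, the dry run falls back to building a fresh
# vocabulary from the training instances. A conceptual sketch of that default path
# (not dry_run_from_params' internals; the reader type and paths are assumptions):
from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

def build_default_vocab(train_data_path, serialization_dir):
    reader = SequenceTaggingDatasetReader()
    instances = list(reader.read(train_data_path))
    vocab = Vocabulary.from_instances(instances)  # default construction from data
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    return vocab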