def test_out_of_vocabulary(self):
    vocabulary = Vocabulary(["a a a b b OOV c"], WordTokenizer(True),
                            out_of_vocabulary="OOV")
    self.assertEqual(1, vocabulary.index("OOV"))
    self.assertEqual(1, vocabulary.index("z"))
    self.assertEqual(2, vocabulary.index("a"))
    self.assertEqual(3, vocabulary.index("b"))
    self.assertEqual(4, vocabulary.index("c"))
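# A note on the assertions above: "z" never occurs in the source text, so it
# resolves to the same index (1) as the designated out-of-vocabulary token
# "OOV", while in-vocabulary types are numbered from 2 upward.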
def test_vocabulary_serialization(self):
    vocabulary_name = os.path.join(self.directory, "vocabulary.pkl")
    vocabulary = Vocabulary(["the quick brown fox jumped over the lazy dog"],
                            WordTokenizer(True))
    self._serialize(vocabulary_name, vocabulary)
    self.assertTrue(os.path.isfile(vocabulary_name))
    deserialized_vocabulary = self._deserialize(vocabulary_name)
    self.assertEqual(vocabulary, deserialized_vocabulary)
    s = "The quick black fox"
    np.testing.assert_equal(vocabulary.index_string(s),
                            deserialized_vocabulary.index_string(s))
def test_invalid_partition(self):
    self.assertRaises(
        ValueError, PartitionedData.from_text, {
            "train": ["red red red", "blue blue green"],
            "validate": ["red blue blue orange"],
            "test": ["green green red black"]
        }, ["bogus"], Vocabulary.factory(WordTokenizer(True)))
def create_partitioned_data():
    return PartitionedData.from_text(
        {
            "train": ["blue blue green", "red red red"],
            "test": ["green green red black"],
            "validate": ["red blue blue orange"]
        }, ["train"], Vocabulary.factory(WordTokenizer(True)))
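# The second argument to PartitionedData.from_text appears to name the
# partitions whose text contributes to the shared vocabulary (compare
# args.vocabulary_partitions in create_data_set below); here only "train" is
# used, so "orange" and "black" should fall outside the vocabulary.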
def test_limited_vocabulary(self):
    # max_vocabulary keeps only the most frequent types.
    v = Vocabulary(["to be or not to be"], WordTokenizer(True),
                   max_vocabulary=2)
    self.assertEqual({"to", "be"}, set(v.type_to_index.keys()))
    self.assertEqual(2, len(v))
    # min_frequency drops types that occur fewer than two times.
    v = Vocabulary(["hamlet hamlet hamlet to be or not to be"],
                   WordTokenizer(True), min_frequency=2)
    self.assertEqual({"to", "be", "hamlet"}, set(v.type_to_index.keys()))
    self.assertEqual(3, len(v))
    # Combined: the frequency cutoff leaves hamlet, to, and be, and
    # max_vocabulary=2 keeps hamlet plus be, the alphabetical tie-winner.
    v = Vocabulary(["hamlet hamlet hamlet to be or not to be"],
                   WordTokenizer(True), max_vocabulary=2, min_frequency=2)
    self.assertEqual({"be", "hamlet"}, set(v.type_to_index.keys()))
    self.assertEqual(2, len(v))
def test_full_vocabulary(self):
    v = Vocabulary(["the quick brown fox jumped over the lazy dog"],
                   WordTokenizer(True))
    self.assertEqual(
        "Vocabulary, size 8: None:1 the:2 brown:3 dog:4 fox:5 ...", str(v))
    self.assertEqual(
        {"the", "quick", "brown", "fox", "jumped", "over", "lazy", "dog"},
        set(v.type_to_index.keys()))
    self.assertEqual(8, len(v))
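# The expected repr above and the indices asserted in test_index_tokens imply
# an indexing convention: index 1 is reserved for the out-of-vocabulary type
# (None when none is configured), and the remaining types are numbered by
# descending frequency with ties broken alphabetically. The sketch below is
# for illustration only, assuming that convention; build_type_index is a
# hypothetical helper, not part of the module under test.
from collections import Counter


def build_type_index(tokens):
    # Sort by descending count, then alphabetically to break ties.
    counts = Counter(tokens)
    ordered = sorted(counts, key=lambda t: (-counts[t], t))
    # Index 1 is the out-of-vocabulary slot, so real types start at 2.
    return {t: i for i, t in enumerate(ordered, start=2)}


# build_type_index("the quick brown fox jumped over the lazy dog".split())
# -> {'the': 2, 'brown': 3, 'dog': 4, 'fox': 5, 'jumped': 6, 'lazy': 7,
#     'over': 8, 'quick': 9}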
def create_data_set(args):
    if args.vocabulary_partitions is None:
        args.vocabulary_partitions = args.partitions.keys()
    tokenizer = {
        "whitespace": WhitespaceTokenizer,
        "word": WordTokenizer,
        "character": CharacterTokenizer
    }[args.tokenizer](args.case_normalized)
    factory = Vocabulary.factory(tokenizer,
                                 min_frequency=args.min_frequency,
                                 max_vocabulary=args.max_vocabulary,
                                 out_of_vocabulary=args.out_of_vocabulary)
    partitions = {}
    for partition, filenames in args.partitions.items():
        texts = []
        for filename in filenames:
            # Close each file promptly instead of leaking the handle.
            with open(filename) as text_file:
                texts.append(text_file.read())
        partitions[partition] = texts
    partitioned_data = PartitionedData.from_text(partitions,
                                                 args.vocabulary_partitions,
                                                 factory)
    partitioned_data.serialize(args.directory)
    logger.info("Created %s in %s", partitioned_data, args.directory)
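# For illustration, create_data_set needs only an object exposing the
# attributes read above, so it can be exercised without the CLI. Every field
# value below is an assumption for the example, not a project default:
#
#     from types import SimpleNamespace
#
#     create_data_set(SimpleNamespace(
#         partitions={"train": ["train.txt"], "test": ["test.txt"]},
#         vocabulary_partitions=None,  # defaults to all partitions
#         tokenizer="word",
#         case_normalized=True,
#         min_frequency=None,
#         max_vocabulary=None,
#         out_of_vocabulary=None,
#         directory="data"))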
def test_invalid_index(self):
    document = "the quick brown fox jumped over the lazy dog"
    vocabulary = Vocabulary([document], WordTokenizer(True))
    self.assertRaises(ValueError, vocabulary.type, 0)
    self.assertRaises(ValueError, vocabulary.type, -1)
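# The assertions above suggest Vocabulary.type is the inverse lookup of
# Vocabulary.index and that indices are 1-based, so 0 and negative indices
# are rejected with ValueError.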
def test_vocabulary_factory(self):
    factory = Vocabulary.factory(WordTokenizer(True), max_vocabulary=2)
    self.assertEqual(
        Vocabulary(["to be or not to be"], WordTokenizer(True),
                   max_vocabulary=2),
        factory(["to be or not to be"]))
def test_index_tokens(self):
    document = "the quick brown fox jumped over the lazy dog"
    vocabulary = Vocabulary([document], WordTokenizer(True))
    np.testing.assert_equal(np.array([2, 9, 3, 5, 6, 8, 2, 7, 4]),
                            vocabulary.index_string(document))