コード例 #1
0
ファイル: test_text.py プロジェクト: wpm/tfrnnlm
 def test_out_of_vocabulary(self):
     """Check index assignment when an out-of-vocabulary marker is configured.

     The marker "OOV" and the unseen type "z" are both expected at index 1,
     while "a", "b" and "c" are expected at indexes 2, 3 and 4.
     """
     corpus = ["a a a b b OOV c"]
     vocabulary = Vocabulary(corpus, WordTokenizer(True), out_of_vocabulary="OOV")
     expected_indexes = (("OOV", 1), ("z", 1), ("a", 2), ("b", 3), ("c", 4))
     for token, expected in expected_indexes:
         self.assertEqual(expected, vocabulary.index(token))
コード例 #2
0
 def test_out_of_vocabulary(self):
     """Unseen types must share the index of the out-of-vocabulary marker.

     With "OOV" configured as the marker, both "OOV" and the unseen "z" are
     expected to map to 1; "a", "b" and "c" map to 2, 3 and 4 respectively.
     """
     documents = ["a a a b b OOV c"]
     vocab = Vocabulary(documents, WordTokenizer(True), out_of_vocabulary="OOV")
     oov_index = 1
     # The marker itself and a type absent from the corpus.
     self.assertEqual(oov_index, vocab.index("OOV"))
     self.assertEqual(oov_index, vocab.index("z"))
     # Types that occur in the corpus.
     self.assertEqual(2, vocab.index("a"))
     self.assertEqual(3, vocab.index("b"))
     self.assertEqual(4, vocab.index("c"))
コード例 #3
0
ファイル: test_text.py プロジェクト: wpm/tfrnnlm
 def test_vocabulary_serialization(self):
     """Round-trip a vocabulary through the test's serialize/deserialize helpers.

     The restored vocabulary must compare equal to the original and index
     the same sample string identically.
     """
     pickle_path = os.path.join(self.directory, "vocabulary.pkl")
     original = Vocabulary(["the quick brown fox jumped over the lazy dog"], WordTokenizer(True))
     self._serialize(pickle_path, original)
     self.assertTrue(os.path.isfile(pickle_path))
     restored = self._deserialize(pickle_path)
     self.assertEqual(original, restored)
     sample = "The quick black fox"
     expected_indexes = original.index_string(sample)
     restored_indexes = restored.index_string(sample)
     np.testing.assert_equal(expected_indexes, restored_indexes)
コード例 #4
0
 def test_vocabulary_serialization(self):
     """Serialize a vocabulary to disk, read it back and compare the two.

     Checks that the file is created, that the deserialized object equals
     the original, and that both produce the same indexes for one string.
     """
     target_file = os.path.join(self.directory, "vocabulary.pkl")
     corpus = ["the quick brown fox jumped over the lazy dog"]
     before = Vocabulary(corpus, WordTokenizer(True))

     self._serialize(target_file, before)
     self.assertTrue(os.path.isfile(target_file))

     after = self._deserialize(target_file)
     self.assertEqual(before, after)

     text = "The quick black fox"
     indexes_before = before.index_string(text)
     indexes_after = after.index_string(text)
     np.testing.assert_equal(indexes_before, indexes_after)
コード例 #5
0
 def test_invalid_partition(self):
     """Requesting a vocabulary partition that is not in the data raises ValueError."""
     # Build the arguments outside the assertion so that only the
     # from_text call itself is expected to raise.
     text_by_partition = {
         "train": ["red red red", "blue blue green"],
         "validate": ["red blue blue orange"],
         "test": ["green green red black"]
     }
     vocabulary_partitions = ["bogus"]
     factory = Vocabulary.factory(WordTokenizer(True))
     build = PartitionedData.from_text
     with self.assertRaises(ValueError):
         build(text_by_partition, vocabulary_partitions, factory)
コード例 #6
0
ファイル: __init__.py プロジェクト: ycchuang/tfrnnlm
def create_partitioned_data():
    """Return a small PartitionedData fixture built from hard-coded text.

    The fixture has "train", "test" and "validate" partitions; only "train"
    is passed as the vocabulary partition.
    """
    text_by_partition = {
        "train": ["blue blue green", "red red red"],
        "test": ["green green red black"],
        "validate": ["red blue blue orange"]
    }
    vocabulary_partitions = ["train"]
    vocabulary_factory = Vocabulary.factory(WordTokenizer(True))
    return PartitionedData.from_text(text_by_partition, vocabulary_partitions, vocabulary_factory)
コード例 #7
0
 def test_limited_vocabulary(self):
     """Check the max_vocabulary and min_frequency limits, alone and combined."""
     hamlet_corpus = ["hamlet hamlet hamlet to be or not to be"]

     # max_vocabulary alone: only "to" and "be" are expected to remain.
     size_limited = Vocabulary(["to be or not to be"], WordTokenizer(True), max_vocabulary=2)
     self.assertEqual({"to", "be"}, set(size_limited.type_to_index.keys()))
     self.assertEqual(2, len(size_limited))

     # min_frequency alone: types occurring fewer than twice are dropped.
     frequency_limited = Vocabulary(hamlet_corpus, WordTokenizer(True), min_frequency=2)
     self.assertEqual({"to", "be", "hamlet"}, set(frequency_limited.type_to_index.keys()))
     self.assertEqual(3, len(frequency_limited))

     # Both limits together: "be" and "hamlet" are expected to remain.
     doubly_limited = Vocabulary(hamlet_corpus, WordTokenizer(True), max_vocabulary=2, min_frequency=2)
     self.assertEqual({"be", "hamlet"}, set(doubly_limited.type_to_index.keys()))
     self.assertEqual(2, len(doubly_limited))
コード例 #8
0
 def test_full_vocabulary(self):
     """Check the string form, the types and the size of an unrestricted vocabulary."""
     corpus = ["the quick brown fox jumped over the lazy dog"]
     vocabulary = Vocabulary(corpus, WordTokenizer(True))

     expected_summary = "Vocabulary, size 8: None:1 the:2 brown:3 dog:4 fox:5 ..."
     self.assertEqual(expected_summary, str(vocabulary))

     expected_types = {"the", "quick", "brown", "fox", "jumped", "over", "lazy", "dog"}
     self.assertEqual(expected_types, set(vocabulary.type_to_index.keys()))

     self.assertEqual(8, len(vocabulary))
コード例 #9
0
ファイル: command.py プロジェクト: wpm/tfrnnlm
def create_data_set(args):
    """Build a partitioned data set from text files and serialize it.

    Reads every file listed in ``args.partitions``, builds a vocabulary
    factory from the tokenizer and vocabulary options on ``args``, creates a
    ``PartitionedData`` from the text and serializes it to ``args.directory``.

    :param args: object carrying the options; the attributes read are
        ``partitions`` (mapping of partition name to file names),
        ``vocabulary_partitions``, ``tokenizer``, ``case_normalized``,
        ``min_frequency``, ``max_vocabulary``, ``out_of_vocabulary`` and
        ``directory``. ``vocabulary_partitions`` is overwritten with all
        partition names when it is None.
    :raises KeyError: if ``args.tokenizer`` is not "whitespace", "word" or
        "character"
    """
    # When no vocabulary partitions were named, use every partition.
    if args.vocabulary_partitions is None:
        args.vocabulary_partitions = args.partitions.keys()
    tokenizer_types = {"whitespace": WhitespaceTokenizer,
                       "word": WordTokenizer,
                       "character": CharacterTokenizer}
    tokenizer = tokenizer_types[args.tokenizer](args.case_normalized)
    factory = Vocabulary.factory(tokenizer,
                                 min_frequency=args.min_frequency, max_vocabulary=args.max_vocabulary,
                                 out_of_vocabulary=args.out_of_vocabulary)
    partitions = {}
    for partition, filenames in args.partitions.items():
        documents = []
        for filename in filenames:
            # Close each file as soon as it has been read instead of leaving
            # the handle for the garbage collector.
            with open(filename) as text_file:
                documents.append(text_file.read())
        partitions[partition] = documents
    partitioned_data = PartitionedData.from_text(partitions, args.vocabulary_partitions, factory)
    partitioned_data.serialize(args.directory)
    # Let the logging module format the message only if it is emitted.
    logger.info("Created %s in %s", partitioned_data, args.directory)
コード例 #10
0
ファイル: command.py プロジェクト: ycchuang/tfrnnlm
def create_data_set(args):
    """Create a ``PartitionedData`` from text files and write it to disk.

    The tokenizer class is chosen by ``args.tokenizer`` and constructed with
    ``args.case_normalized``. Every file named in ``args.partitions`` is read
    in full, the data set is built with ``PartitionedData.from_text`` and
    then serialized to ``args.directory``.

    :param args: object carrying the options; the attributes read are
        ``partitions`` (mapping of partition name to file names),
        ``vocabulary_partitions``, ``tokenizer``, ``case_normalized``,
        ``min_frequency``, ``max_vocabulary``, ``out_of_vocabulary`` and
        ``directory``. ``vocabulary_partitions`` is overwritten with all
        partition names when it is None.
    :raises KeyError: if ``args.tokenizer`` is not "whitespace", "word" or
        "character"
    """
    # When no vocabulary partitions were named, use every partition.
    if args.vocabulary_partitions is None:
        args.vocabulary_partitions = args.partitions.keys()
    tokenizer = {
        "whitespace": WhitespaceTokenizer,
        "word": WordTokenizer,
        "character": CharacterTokenizer
    }[args.tokenizer](args.case_normalized)
    factory = Vocabulary.factory(tokenizer,
                                 min_frequency=args.min_frequency,
                                 max_vocabulary=args.max_vocabulary,
                                 out_of_vocabulary=args.out_of_vocabulary)
    partitions = {}
    for partition, filenames in args.partitions.items():
        texts = []
        for filename in filenames:
            # The context manager closes the file deterministically; the
            # original left every handle open until garbage collection.
            with open(filename) as handle:
                texts.append(handle.read())
        partitions[partition] = texts
    partitioned_data = PartitionedData.from_text(partitions,
                                                 args.vocabulary_partitions,
                                                 factory)
    partitioned_data.serialize(args.directory)
    # Pass the values as logging arguments so formatting is deferred.
    logger.info("Created %s in %s", partitioned_data, args.directory)
コード例 #11
0
 def test_invalid_index(self):
     """Looking up the type for index 0 or -1 must raise ValueError."""
     text = "the quick brown fox jumped over the lazy dog"
     vocab = Vocabulary([text], WordTokenizer(True))
     for bad_index in (0, -1):
         # Resolve the method before the assertion so only the call is guarded.
         lookup = vocab.type
         with self.assertRaises(ValueError):
             lookup(bad_index)
コード例 #12
0
 def test_vocabulary_factory(self):
     """A factory must build the same vocabulary as the direct constructor call."""
     corpus_text = "to be or not to be"
     factory = Vocabulary.factory(WordTokenizer(True), max_vocabulary=2)
     built_directly = Vocabulary([corpus_text], WordTokenizer(True), max_vocabulary=2)
     built_by_factory = factory([corpus_text])
     self.assertEqual(built_directly, built_by_factory)
コード例 #13
0
ファイル: test_text.py プロジェクト: wpm/tfrnnlm
 def test_index_tokens(self):
     """Indexing the corpus string itself must yield the expected index array."""
     text = "the quick brown fox jumped over the lazy dog"
     vocab = Vocabulary([text], WordTokenizer(True))
     expected = np.array([2, 9, 3, 5, 6, 8, 2, 7, 4])
     actual = vocab.index_string(text)
     np.testing.assert_equal(expected, actual)
コード例 #14
0
ファイル: test_text.py プロジェクト: wpm/tfrnnlm
 def test_vocabulary_factory(self):
     """Compare a factory-built vocabulary with one constructed directly.

     Both are created from the same text with ``max_vocabulary=2`` and are
     expected to be equal.
     """
     make_vocabulary = Vocabulary.factory(WordTokenizer(True), max_vocabulary=2)
     expected = Vocabulary(["to be or not to be"], WordTokenizer(True), max_vocabulary=2)
     actual = make_vocabulary(["to be or not to be"])
     self.assertEqual(expected, actual)
コード例 #15
0
 def test_index_tokens(self):
     """Check the indexes produced for each token of the corpus sentence."""
     sentence = "the quick brown fox jumped over the lazy dog"
     vocabulary = Vocabulary([sentence], WordTokenizer(True))
     # One expected index per token of the sentence, in order.
     expected_indexes = np.array([2, 9, 3, 5, 6, 8, 2, 7, 4])
     indexed_sentence = vocabulary.index_string(sentence)
     np.testing.assert_equal(expected_indexes, indexed_sentence)
コード例 #16
0
ファイル: __init__.py プロジェクト: wpm/tfrnnlm
def create_partitioned_data():
    """Build the PartitionedData fixture from three hard-coded partitions.

    The vocabulary is taken from the "train" partition only, using a
    factory built around ``WordTokenizer(True)``.
    """
    documents = {
        "train": ["blue blue green", "red red red"],
        "test": ["green green red black"],
        "validate": ["red blue blue orange"]
    }
    factory = Vocabulary.factory(WordTokenizer(True))
    partitioned_data = PartitionedData.from_text(documents, ["train"], factory)
    return partitioned_data
コード例 #17
0
ファイル: test_text.py プロジェクト: wpm/tfrnnlm
 def test_invalid_partition(self):
     """from_text must raise ValueError for the unknown vocabulary partition "bogus"."""
     documents = {
         "train": ["red red red", "blue blue green"],
         "validate": ["red blue blue orange"],
         "test": ["green green red black"]
     }
     unknown_partitions = ["bogus"]
     vocabulary_factory = Vocabulary.factory(WordTokenizer(True))
     self.assertRaises(ValueError, PartitionedData.from_text,
                       documents, unknown_partitions, vocabulary_factory)