Ejemplo n.º 1
0
    def test_create_minimum_transducer2(self):
        """Look up Japanese keys through a Matcher built from two compiled FSTs.

        The entries are split over two dictionaries; each is minimised and
        compiled on its own and both compiled blobs are given to one Matcher.
        Every query uses the matcher's default matching mode.
        """
        white, red, brown, peach = [
            colour.encode('utf8') for colour in ('白', '赤', '茶', '桃')
        ]
        first_entries = [
            ('さくら'.encode('utf8'), white),
            ('さくらんぼ'.encode('utf8'), red),
            ('すもも'.encode('utf8'), red),
        ]
        second_entries = [
            ('なし'.encode('utf8'), brown),
            ('もも'.encode('utf8'), peach),
        ]

        _, first_fst = fst.create_minimum_transducer(first_entries)
        _, second_fst = fst.create_minimum_transducer(second_entries)
        matcher = Matcher(
            [fst.compileFST(first_fst), fst.compileFST(second_fst)])

        # (query, expected result) pairs, checked in this order.
        cases = [
            # accepted strings
            ('さくら', (True, {white})),
            ('さくらんぼ', (True, {white, red})),
            ('さくらさく', (True, {white})),
            ('すもも', (True, {red})),
            ('なし', (True, {brown})),
            ('もも', (True, {peach})),
            # not accepted string
            ('みかん', (False, set())),
        ]
        for query, expected in cases:
            self.assertEqual(expected, matcher.run(query.encode('utf8')))
Ejemplo n.º 2
0
    def test_create_minimum_transducer2(self):
        """Check lookups of Japanese keys spread over two compiled FSTs.

        Two entry lists are minimised and compiled separately; the Matcher
        receives both compiled results and is queried with UTF-8 bytes.
        """
        def utf8(text):
            # All keys, outputs and queries are handled as UTF-8 bytes.
            return text.encode('utf8')

        entries_a = [(utf8(u'さくら'), utf8(u'白')),
                     (utf8(u'さくらんぼ'), utf8(u'赤')),
                     (utf8(u'すもも'), utf8(u'赤'))]
        entries_b = [(utf8(u'なし'), utf8(u'茶')),
                     (utf8(u'もも'), utf8(u'桃'))]

        _, dict_a = fst.create_minimum_transducer(entries_a)
        _, dict_b = fst.create_minimum_transducer(entries_b)
        compiled = [fst.compileFST(dict_a), fst.compileFST(dict_b)]

        matcher = Matcher(compiled)

        # accepted strings
        expected = (True, {utf8(u'白')})
        self.assertEqual(expected, matcher.run(utf8(u'さくら')))
        expected = (True, {utf8(u'白'), utf8(u'赤')})
        self.assertEqual(expected, matcher.run(utf8(u'さくらんぼ')))
        expected = (True, {utf8(u'白')})
        self.assertEqual(expected, matcher.run(utf8(u'さくらさく')))
        expected = (True, {utf8(u'赤')})
        self.assertEqual(expected, matcher.run(utf8(u'すもも')))
        expected = (True, {utf8(u'茶')})
        self.assertEqual(expected, matcher.run(utf8(u'なし')))
        expected = (True, {utf8(u'桃')})
        self.assertEqual(expected, matcher.run(utf8(u'もも')))

        # not accepted string
        self.assertEqual((False, set()), matcher.run(utf8(u'みかん')))
Ejemplo n.º 3
0
    def test_create_minimum_transducer1(self):
        """Look up month abbreviations through a Matcher built from two FSTs.

        Outputs are day counts packed with ``pack('I', n)``. The key 'feb'
        appears in both dictionaries with different outputs (28 and 29), and
        the assertion for it expects both values back.
        """
        inputs1 = [('apr'.encode('utf8'), pack('I', 30)),
                   ('aug'.encode('utf8'), pack('I', 31)),
                   ('dec'.encode('utf8'), pack('I', 31)),
                   ('feb'.encode('utf8'), pack('I', 28))]
        inputs2 = [('feb'.encode('utf8'), pack('I', 29)),
                   ('jan'.encode('utf8'), pack('I', 31)),
                   ('jul'.encode('utf8'), pack('I', 31)),
                   ('jun'.encode('utf8'), pack('I', 30)),
                   ('may'.encode('utf8'), pack('I', 31))]
        # The processed-entry counts are not needed by this test.
        _, dictionary1 = fst.create_minimum_transducer(inputs1)
        _, dictionary2 = fst.create_minimum_transducer(inputs2)
        data = [fst.compileFST(dictionary1), fst.compileFST(dictionary2)]

        m = Matcher(data)

        # (query, expected packed outputs) pairs, checked in this order.
        accepted = [
            ('apr', {pack('I', 30)}),
            ('aug', {pack('I', 31)}),
            ('dec', {pack('I', 31)}),
            ('feb', {pack('I', 28), pack('I', 29)}),
            ('jan', {pack('I', 31)}),
            ('jul', {pack('I', 31)}),
            ('jun', {pack('I', 30)}),
            ('may', {pack('I', 31)}),
        ]
        for word, outputs in accepted:
            self.assertEqual((True, outputs), m.run(word.encode('utf8')))

        # not accepted string; encoded like every other query so that the
        # rejection does not depend on how the matcher treats a text
        # (non-bytes) argument
        self.assertEqual((False, set()), m.run('mar'.encode('utf8')))
Ejemplo n.º 4
0
    def test_create_minimum_transducer1(self):
        """Check month-name lookups against two separately compiled FSTs.

        Each dictionary maps a three-letter month key to a day count packed
        with ``pack('I', n)``. 'feb' is defined in both dictionaries (28 and
        29), and the test expects the matcher to return both outputs for it.
        """
        inputs1 = [
            ('apr'.encode('utf8'), pack('I', 30)),
            ('aug'.encode('utf8'), pack('I', 31)),
            ('dec'.encode('utf8'), pack('I', 31)),
            ('feb'.encode('utf8'), pack('I', 28)),
        ]
        inputs2 = [
            ('feb'.encode('utf8'), pack('I', 29)),
            ('jan'.encode('utf8'), pack('I', 31)),
            ('jul'.encode('utf8'), pack('I', 31)),
            ('jun'.encode('utf8'), pack('I', 30)),
            ('may'.encode('utf8'), pack('I', 31)),
        ]
        # Only the transducers are used; the processed counts are discarded.
        _, dictionary1 = fst.create_minimum_transducer(inputs1)
        _, dictionary2 = fst.create_minimum_transducer(inputs2)
        data = [fst.compileFST(dictionary1), fst.compileFST(dictionary2)]

        m = Matcher(data)
        # accepted strings
        self.assertEqual((True, {pack('I', 30)}), m.run('apr'.encode('utf8')))
        self.assertEqual((True, {pack('I', 31)}), m.run('aug'.encode('utf8')))
        self.assertEqual((True, {pack('I', 31)}), m.run('dec'.encode('utf8')))
        self.assertEqual((True, {pack('I', 28), pack('I', 29)}),
                         m.run('feb'.encode('utf8')))
        self.assertEqual((True, {pack('I', 31)}), m.run('jan'.encode('utf8')))
        self.assertEqual((True, {pack('I', 31)}), m.run('jul'.encode('utf8')))
        self.assertEqual((True, {pack('I', 30)}), m.run('jun'.encode('utf8')))
        self.assertEqual((True, {pack('I', 31)}), m.run('may'.encode('utf8')))
        # not accepted string; passed as encoded bytes like the accepted
        # queries above, so the negative case goes through the same input
        # type as the positive ones
        self.assertEqual((False, set()), m.run('mar'.encode('utf8')))
Ejemplo n.º 5
0
    def test_create_minimum_transducer2(self):
        """Look up Japanese keys in a single transducer built from all entries.

        The value returned by ``create_minimum_transducer`` is passed straight
        to ``compileFST``, and the compiled result straight to ``Matcher``.
        """
        white = u'白'.encode('utf8')
        red = u'赤'.encode('utf8')
        brown = u'茶'.encode('utf8')
        peach = u'桃'.encode('utf8')

        entries = [
            (u'さくら'.encode('utf8'), white),
            (u'さくらんぼ'.encode('utf8'), red),
            (u'すもも'.encode('utf8'), red),
            (u'なし'.encode('utf8'), brown),
            (u'もも'.encode('utf8'), peach),
        ]
        compiled = fst.compileFST(fst.create_minimum_transducer(entries))

        matcher = Matcher(compiled)

        # (query, expected result) pairs, checked in this order.
        cases = [
            # accepted strings
            (u'さくら', (True, {white})),
            (u'さくらんぼ', (True, {white, red})),
            (u'さくらさく', (True, {white})),
            (u'すもも', (True, {red})),
            (u'なし', (True, {brown})),
            (u'もも', (True, {peach})),
            # not accepted string
            (u'みかん', (False, set())),
        ]
        for query, expected in cases:
            self.assertEqual(expected, matcher.run(query.encode('utf8')))
Ejemplo n.º 6
0
    def test_matcher_cache(self):
        """Run a sequence of prefix-match queries against one Matcher instance.

        The same matcher is queried repeatedly, including a repeated query,
        with the second argument of ``run`` set to True.
        NOTE(review): presumably this exercises a cache inside Matcher; the
        cache itself is not visible from this test.
        """
        one, two, three = pack('I', 1), pack('I', 2), pack('I', 3)
        entries = [
            (u'す'.encode('utf8'), one),
            (u'すも'.encode('utf8'), two),
            (u'すもも'.encode('utf8'), three),
        ]
        _, dictionary = fst.create_minimum_transducer(entries)
        matcher = Matcher([fst.compileFST(dictionary)])

        # matches 'す', 'すも', 'すもも'
        # Query order is kept exactly as listed; the third query repeats
        # the first.
        queries = [
            (u'すもも', {one, two, three}),
            (u'すもうとり', {one, two}),
            (u'すもも', {one, two, three}),
            (u'すもももももももものうち', {one, two, three}),
            (u'す', {one}),
        ]
        for text, outputs in queries:
            self.assertEqual((True, outputs),
                             matcher.run(text.encode('utf8'), True))
Ejemplo n.º 7
0
def save_partial_fst(arg, outdir):
    """Build, compile and save the FST for one pickled partition of entries.

    Args:
        arg: A ``(part_idx, part_file)`` pair. ``part_file`` is the path of a
            pickle file whose content is passed to
            ``create_minimum_transducer``; ``part_idx`` is forwarded to
            ``save_fstdata`` as ``part``.
        outdir: Forwarded to ``save_fstdata`` as ``dir``.

    Returns:
        The processed-entry count returned by ``create_minimum_transducer``.
    """
    part_idx, part_file = arg
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # confirm that part_file is always produced by a trusted step.
    with open(part_file, 'rb') as f:
        entries = pickle.load(f)
    # The file handle is closed here, before the build/compile/save work.
    processed, transducer = create_minimum_transducer(entries)
    compiled = compileFST(transducer)
    save_fstdata(compiled, dir=outdir, part=part_idx)
    # Let the logging framework format the message only if it is emitted.
    logger.info('processed entries=%d', processed)
    return processed
Ejemplo n.º 8
0
    def test_common_prefix_match(self):
        """Expect the outputs of every stored prefix when run() gets True.

        The keys 'す', 'すも' and 'すもも' are prefixes of one another; the
        query 'すもも' with the second argument True must return all three
        packed outputs.
        """
        one, two, three = pack('I', 1), pack('I', 2), pack('I', 3)
        entries = [
            (u'す'.encode('utf8'), one),
            (u'すも'.encode('utf8'), two),
            (u'すもも'.encode('utf8'), three),
        ]
        compiled = fst.compileFST(fst.create_minimum_transducer(entries))

        matcher = Matcher(compiled)
        # matches 'す', 'すも', 'すもも'
        result = matcher.run(u'すもも'.encode('utf8'), True)
        self.assertEqual((True, {one, two, three}), result)
Ejemplo n.º 9
0
    def test_perfect_match(self):
        """Expect only the exact key's output when run() gets False.

        Although 'す' and 'すも' are prefixes of the query 'すもも', only the
        output packed for 'すもも' itself is expected.
        """
        keys = ('す', 'すも', 'すもも')
        # Outputs are the 1-based positions of the keys, packed as 'I'.
        entries = [(key.encode('utf8'), pack('I', number))
                   for number, key in enumerate(keys, 1)]
        _, dictionary = fst.create_minimum_transducer(entries)
        matcher = Matcher([fst.compileFST(dictionary)])

        # matches 'すもも'
        result = matcher.run('すもも'.encode('utf8'), False)
        self.assertEqual((True, {pack('I', 3)}), result)
Ejemplo n.º 10
0
    def test_common_prefix_match(self):
        """Query 'すもも' with run(..., True) and expect all three outputs.

        The dictionary holds 'す', 'すも' and 'すもも'; each is a prefix of
        the query, and the packed outputs 1, 2 and 3 are all expected back.
        """
        keys = (u'す', u'すも', u'すもも')
        # Outputs are the 1-based positions of the keys, packed as 'I'.
        entries = [(key.encode('utf8'), pack('I', number))
                   for number, key in enumerate(keys, 1)]
        _, dictionary = fst.create_minimum_transducer(entries)
        compiled = [fst.compileFST(dictionary)]

        matcher = Matcher(compiled)
        # matches 'す', 'すも', 'すもも'
        every_output = {pack('I', number) for number in (1, 2, 3)}
        self.assertEqual((True, every_output),
                         matcher.run(u'すもも'.encode('utf8'), True))
Ejemplo n.º 11
0
    def test_perfect_match(self):
        """Query 'すもも' with run(..., False) and expect a single output.

        The shorter keys 'す' and 'すも' are also in the dictionary, but only
        the output packed for the full key is expected.
        """
        one, two, three = pack('I', 1), pack('I', 2), pack('I', 3)
        entries = [
            (u'す'.encode('utf8'), one),
            (u'すも'.encode('utf8'), two),
            (u'すもも'.encode('utf8'), three),
        ]
        compiled = fst.compileFST(fst.create_minimum_transducer(entries))

        matcher = Matcher(compiled)
        # matches 'すもも'
        result = matcher.run(u'すもも'.encode('utf8'), False)
        self.assertEqual((True, {three}), result)
Ejemplo n.º 12
0
    def test_matcher_cache(self):
        """Issue several prefix-match queries, one repeated, to one Matcher.

        All queries pass True as the second argument of ``run`` and share the
        same matcher instance.
        NOTE(review): the name suggests a cache inside Matcher is the target;
        that cache is not visible from this test.
        """
        one, two, three = [pack('I', number) for number in (1, 2, 3)]
        entries = [
            (u'す'.encode('utf8'), one),
            (u'すも'.encode('utf8'), two),
            (u'すもも'.encode('utf8'), three),
        ]
        compiled = fst.compileFST(fst.create_minimum_transducer(entries))

        matcher = Matcher(compiled)
        # matches 'す', 'すも', 'すもも'
        # The sequence below is run in order; 'すもも' is queried twice.
        sequence = [
            (u'すもも', {one, two, three}),
            (u'すもうとり', {one, two}),
            (u'すもも', {one, two, three}),
            (u'すもももももももものうち', {one, two, three}),
            (u'す', {one}),
        ]
        for text, outputs in sequence:
            self.assertEqual((True, outputs),
                             matcher.run(text.encode('utf8'), True))