Exemple #1
0
def test_phrases():
    """Check phrase extraction and the target phrases derived from it.

    A small Japanese/English corpus is built, the pair
    ("私 は 先生 です", "I am a teacher") is aligned with ``symmetrization``
    and phrase pairs are extracted with ``phrase_extract``.  The extracted
    pairs are printed and compared with the expected pairs, then the target
    sides are passed to ``available_phrases`` and the result is compared
    with the expected set of ``(position, word)`` tuples.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentences = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]

    corpus = mkcorpus(sentences)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # Every expected pair must be its own element: without the separating
    # commas, adjacent string literals are silently concatenated into one.
    expected_pairs = {
        "は 先生 です <-> a teacher",
        "先生 <-> teacher",
        "私 <-> I am",
        "私 は 先生 です <-> I am a teacher",
    }
    rendered = ["{} {} {}".format(' '.join(e), "<->", ' '.join(f))
                for e, f in ext]
    for line in rendered:
        print(line)
    # Compare as sets so the check does not depend on iteration order.
    assert set(rendered) == expected_pairs

    ## phrases
    fs = "I am a teacher".split()
    # Only the target side of each extracted pair is handed over.
    phrases = available_phrases(fs, [fs_ph for (_, fs_ph) in ext])
    print(phrases)
    ans = {((1, 'I'), (2, 'am')),
           ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
           ((4, 'teacher'),),
           ((3, 'a'), (4, 'teacher'))}
    assert ans == phrases
Exemple #2
0
def test_phrases():
    """Verify extracted phrase pairs and the phrases available for decoding.

    Builds a five-sentence Japanese/English corpus, symmetrizes the
    alignment of ("私 は 先生 です", "I am a teacher"), extracts phrase pairs
    and checks them against the expected "source <-> target" strings.  The
    target sides of the extracted pairs are then given to
    ``available_phrases`` and the returned value is compared with the
    expected set of ``(position, word)`` tuples.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentences = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]

    corpus = mkcorpus(sentences)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # One string per phrase pair; a missing comma between two literals would
    # merge them into a single (wrong) expected entry.
    expected_lines = {
        "は 先生 です <-> a teacher",
        "先生 <-> teacher",
        "私 <-> I am",
        "私 は 先生 です <-> I am a teacher",
    }
    shown = set()
    for e, f in ext:
        line = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(line)
        shown.add(line)
    assert shown == expected_lines

    ## phrases
    fs = "I am a teacher".split()
    # Computed once; only the target side of each pair is passed on.
    phrases = available_phrases(fs, [fs_ph for (_, fs_ph) in ext])
    print(phrases)
    ans = {
        ((1, 'I'), (2, 'am')),
        ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
        ((4, 'teacher'), ),
        ((3, 'a'), (4, 'teacher')),
    }
    assert ans == phrases
Exemple #3
0
 def test_symmetrization(self):
     """symmetrization() on the sample pair returns the expected index pairs."""
     parallel_pairs = [
         ("僕 は 男 です", "I am a man"),
         ("私 は 女 です", "I am a girl"),
         ("私 は 先生 です", "I am a teacher"),
         ("彼女 は 先生 です", "She is a teacher"),
         ("彼 は 先生 です", "He is a teacher"),
     ]
     corpus = mkcorpus(parallel_pairs)
     es, fs = "私 は 先生 です".split(), "I am a teacher".split()
     expected = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
     self.assertEqual(symmetrization(es, fs, corpus), expected)
Exemple #4
0
 def test_symmetrization(self):
     """Compare the symmetrized alignment of one sentence pair with a fixed set."""
     # Training material handed to mkcorpus: (source, target) sentence pairs.
     training = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher")]
     corpus = mkcorpus(training)

     source_words = "私 は 先生 です".split()
     target_words = "I am a teacher".split()
     result = symmetrization(source_words, target_words, corpus)

     wanted = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
     self.assertEqual(result, wanted)
Exemple #5
0
    def test_phrase_extract(self):
        """phrase_extract() returns the expected phrase pairs for two inputs.

        The first case uses a hand-written alignment between an English and
        a German sentence.  The second case derives the alignment with
        ``symmetrization`` from a small Japanese/English corpus.
        """
        # The alignment matrix below has one row per word of `es` and one
        # column per word of `fs`; an `x` marks an aligned (row, column) pair.
        #
        # |x| | | | | | | | | |
        # | |x|x|x| | | | | | |
        # | | | | | |x| | | | |
        # | | | | | | |x| | | |
        # | | | | | | | | | |x|
        # | | | | | | | | | |x|
        # | | | | | | | |x| | |
        # | | | | | | | |x| | |
        # | | | | | | | | |x| |
        #
        es = "michael assumes that he will stay in the house".split()
        fs = "michael geht davon aus , dass er im haus bleibt".split()
        alignment = set([(1, 1), (2, 2), (2, 3), (2, 4), (3, 6), (4, 7),
                         (5, 10), (6, 10), (7, 8), (8, 8), (9, 9)])
        ans = set([
            (('assumes', ), ('geht', 'davon', 'aus')),
            (('assumes', ), ('geht', 'davon', 'aus', ',')),
            (('assumes', 'that'), ('geht', 'davon', 'aus', ',', 'dass')),
            (('assumes', 'that', 'he'), ('geht', 'davon', 'aus', ',', 'dass',
                                         'er')),
            (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
              'bleibt')),
            (('he', ), ('er', )),
            (('he', 'will', 'stay', 'in', 'the', 'house'), ('er', 'im', 'haus',
                                                            'bleibt')),
            (('house', ), ('haus', )),
            (('in', 'the'), ('im', )),
            (('in', 'the', 'house'), ('im', 'haus')),
            (('michael', ), ('michael', )),
            (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus')),
            (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus', ',')),
            (('michael', 'assumes', 'that'), ('michael', 'geht', 'davon',
                                              'aus', ',', 'dass')),
            (('michael', 'assumes', 'that', 'he'), ('michael', 'geht', 'davon',
                                                    'aus', ',', 'dass', 'er')),
            (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in', 'the',
              'house'), ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er',
                         'im', 'haus', 'bleibt')),
            (('that', ), (',', 'dass')),
            (('that', ), ('dass', )),
            (('that', 'he'), (',', 'dass', 'er')),
            (('that', 'he'), ('dass', 'er')),
            (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
            (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             ('dass', 'er', 'im', 'haus', 'bleibt')),
            (('will', 'stay'), ('bleibt', )),
            (('will', 'stay', 'in', 'the', 'house'), ('im', 'haus', 'bleibt'))
        ])
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # another test
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentences = [
            ("僕 は 男 です", "I am a man"),
            ("私 は 女 です", "I am a girl"),
            ("私 は 先生 です", "I am a teacher"),
            ("彼女 は 先生 です", "She is a teacher"),
            ("彼 は 先生 です", "He is a teacher"),
        ]
        corpus = mkcorpus(sentences)
        alignment = symmetrization(es, fs, corpus)
        # Expected tokens are written as literal characters, exactly like the
        # input above.  UTF-8 byte escapes inside a text literal denote
        # different (Latin-1) characters and would never equal the tokens
        # produced by splitting the literal input string.
        ans = set([
            (('は', '先生', 'です'), ('a', 'teacher')),
            (('先生', ), ('teacher', )),
            (('私', ), ('I', 'am')),
            (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher')),
        ])
        self.assertEqual(phrase_extract(es, fs, alignment), ans)
Exemple #6
0
    def test_phrase_extract(self):
        """Exercise phrase_extract() on a fixed and on a learned alignment.

        Case 1: an English/German sentence pair with an explicit alignment
        set; the full set of expected phrase pairs is spelled out.
        Case 2: a Japanese/English pair whose alignment comes from
        ``symmetrization`` over a five-sentence corpus.
        """
        # Alignment matrix for case 1: rows follow the words of `es`,
        # columns follow the words of `fs`, `x` marks an alignment point.
        #
        # |x| | | | | | | | | |
        # | |x|x|x| | | | | | |
        # | | | | | |x| | | | |
        # | | | | | | |x| | | |
        # | | | | | | | | | |x|
        # | | | | | | | | | |x|
        # | | | | | | | |x| | |
        # | | | | | | | |x| | |
        # | | | | | | | | |x| |
        #
        es = "michael assumes that he will stay in the house".split()
        fs = "michael geht davon aus , dass er im haus bleibt".split()
        alignment = set([(1, 1),
                         (2, 2),
                         (2, 3),
                         (2, 4),
                         (3, 6),
                         (4, 7),
                         (5, 10),
                         (6, 10),
                         (7, 8),
                         (8, 8),
                         (9, 9)])
        ans = set([(('assumes',), ('geht', 'davon', 'aus')),
                   (('assumes',), ('geht', 'davon', 'aus', ',')),
                   (('assumes', 'that'),
                    ('geht', 'davon', 'aus', ',', 'dass')),
                   (('assumes', 'that', 'he'),
                    ('geht', 'davon', 'aus', ',', 'dass', 'er')),
                   (('assumes', 'that', 'he',
                     'will', 'stay', 'in', 'the', 'house'),
                    ('geht', 'davon', 'aus', ',', 'dass',
                     'er', 'im', 'haus', 'bleibt')),
                   (('he',), ('er',)),
                   (('he', 'will', 'stay', 'in', 'the', 'house'),
                    ('er', 'im', 'haus', 'bleibt')),
                   (('house',), ('haus',)),
                   (('in', 'the'), ('im',)),
                   (('in', 'the', 'house'), ('im', 'haus')),
                   (('michael',), ('michael',)),
                   (('michael', 'assumes'),
                    ('michael', 'geht', 'davon', 'aus')),
                   (('michael', 'assumes'),
                    ('michael', 'geht', 'davon', 'aus', ',')),
                   (('michael', 'assumes', 'that'),
                    ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
                   (('michael', 'assumes', 'that', 'he'),
                    ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
                   (('michael', 'assumes', 'that', 'he',
                     'will', 'stay', 'in', 'the', 'house'),
                    ('michael', 'geht', 'davon', 'aus', ',',
                     'dass', 'er', 'im', 'haus', 'bleibt')),
                   (('that',), (',', 'dass')),
                   (('that',), ('dass',)),
                   (('that', 'he'), (',', 'dass', 'er')),
                   (('that', 'he'), ('dass', 'er')),
                   (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
                    (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
                   (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
                    ('dass', 'er', 'im', 'haus', 'bleibt')),
                   (('will', 'stay'), ('bleibt',)),
                   (('will', 'stay', 'in', 'the', 'house'),
                    ('im', 'haus', 'bleibt'))])
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # another test
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentences = [("僕 は 男 です", "I am a man"),
                     ("私 は 女 です", "I am a girl"),
                     ("私 は 先生 です", "I am a teacher"),
                     ("彼女 は 先生 です", "She is a teacher"),
                     ("彼 は 先生 です", "He is a teacher"),
                     ]
        corpus = mkcorpus(sentences)
        alignment = symmetrization(es, fs, corpus)
        # The expected tokens use the same literal characters as the input.
        # Writing them as UTF-8 byte escapes in a text literal yields other
        # characters, so the comparison with the split input could not hold.
        ans = set([(('は', '先生', 'です'),
                    ('a', 'teacher')),
                   (('先生',), ('teacher',)),
                   (('私',), ('I', 'am')),
                   (('私', 'は', '先生', 'です'),
                    ('I', 'am', 'a', 'teacher'))])
        self.assertEqual(phrase_extract(es, fs, alignment), ans)