Esempio n. 1
0
def test_phrases():
    """Check phrase extraction and ``available_phrases`` on a toy corpus.

    A symmetrized alignment is computed for one Japanese/English sentence
    pair, phrase pairs are extracted from it, and both the extracted
    pairs and the phrases available for the English sentence are compared
    with their expected values.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentences = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]

    corpus = mkcorpus(sentences)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # Each entry needs its own comma: adjacent string literals are
    # concatenated into one string when the comma is missing.
    expected_pairs = {
        "は 先生 です <-> a teacher",
        "先生 <-> teacher",
        "私 <-> I am",
        "私 は 先生 です <-> I am a teacher",
    }
    rendered_pairs = set()
    for e, f in ext:
        rendered = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(rendered)
        rendered_pairs.add(rendered)
    assert expected_pairs == rendered_pairs

    ## phrases
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    expected_phrases = {
        ((1, 'I'), (2, 'am')),
        ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
        ((4, 'teacher'), ),
        ((3, 'a'), (4, 'teacher')),
    }
    assert expected_phrases == phrases
Esempio n. 2
0
def test_phrases():
    """Verify extracted phrase pairs and available phrases for one sentence.

    The corpus is five short Japanese/English sentence pairs. The
    sentence under test is "私 は 先生 です" / "I am a teacher".
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentences = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]

    corpus = mkcorpus(sentences)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # One literal per element; without the separating commas Python
    # joins neighbouring string literals into a single string.
    expected_pairs = {"は 先生 です <-> a teacher",
                      "先生 <-> teacher",
                      "私 <-> I am",
                      "私 は 先生 です <-> I am a teacher"}
    rendered_pairs = set()
    for e, f in ext:
        line = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(line)
        rendered_pairs.add(line)
    assert expected_pairs == rendered_pairs

    ## phrases
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    expected_phrases = {((1, 'I'), (2, 'am')),
                        ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
                        ((4, 'teacher'),),
                        ((3, 'a'), (4, 'teacher'))}
    assert expected_phrases == phrases
Esempio n. 3
0
 def test_symmetrization(self):
     """Compare ``symmetrization`` output with a fixed alignment set."""
     training_pairs = [
         ("僕 は 男 です", "I am a man"),
         ("私 は 女 です", "I am a girl"),
         ("私 は 先生 です", "I am a teacher"),
         ("彼女 は 先生 です", "She is a teacher"),
         ("彼 は 先生 です", "He is a teacher"),
     ]
     trained = mkcorpus(training_pairs)
     es_words = "私 は 先生 です".split()
     fs_words = "I am a teacher".split()
     expected = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
     self.assertEqual(symmetrization(es_words, fs_words, trained), expected)
Esempio n. 4
0
 def test_symmetrization(self):
     """Symmetrized alignment of one sentence pair must match ``wanted``."""
     pairs = [("僕 は 男 です", "I am a man"),
              ("私 は 女 です", "I am a girl"),
              ("私 は 先生 です", "I am a teacher"),
              ("彼女 は 先生 です", "She is a teacher"),
              ("彼 は 先生 です", "He is a teacher")]
     built_corpus = mkcorpus(pairs)
     es_tokens, fs_tokens = "私 は 先生 です".split(), "I am a teacher".split()
     result = symmetrization(es_tokens, fs_tokens, built_corpus)
     wanted = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
     self.assertEqual(result, wanted)
Esempio n. 5
0
def train(sentences, loop_count=1000):
    """Build a corpus from *sentences* and train on it.

    The corpus comes from ``utility.mkcorpus``; the return value is
    whatever ``_train`` returns for that corpus and *loop_count*.
    """
    return _train(utility.mkcorpus(sentences), loop_count)
Esempio n. 6
0
def train(sentences, loop_count=1000):
    """Convert *sentences* with ``utility.mkcorpus`` and run ``_train``.

    *loop_count* (default 1000) is forwarded to ``_train`` unchanged.
    """
    prepared = utility.mkcorpus(sentences)
    result = _train(prepared, loop_count)
    return result
Esempio n. 7
0
if __name__ == '__main__':

    # test2
    from smt.utils.utility import mkcorpus
    from word_alignment import alignment
    from smt.ibmmodel import ibmmodel2
    import sys

    delimiter = ","
    # Load the training file named by the first command-line argument.
    # The ``with`` block closes the file handle once it has been read.
    with open(sys.argv[1]) as modelfd:
        sentenses = [line.rstrip().split(delimiter) for line in modelfd]
    # make corpus
    corpus = mkcorpus(sentenses)

    # train model from corpus
    f2e_train = ibmmodel2._train(corpus, loop_count=10)
    # Reverse the order of the fields in every corpus entry so the second
    # model is trained in the opposite direction.
    e2f_corpus = list(zip(*reversed(list(zip(*corpus)))))
    e2f_train = ibmmodel2._train(e2f_corpus, loop_count=10)

    # phrase extraction: one delimiter-separated sentence pair per stdin line
    for line in sys.stdin:
        _es, _fs = line.rstrip().split(delimiter)
        es = _es.split()
        fs = _fs.split()

        f2e = ibmmodel2.viterbi_alignment(es, fs, *f2e_train).items()
        e2f = ibmmodel2.viterbi_alignment(fs, es, *e2f_train).items()
        align = alignment(es, fs, e2f, f2e)  # symmetrized alignment
Esempio n. 8
0
    #                 (6, 10),
    #                 (7, 8),
    #                 (8, 8),
    #                 (9, 9)])
    #pprint(phrase_extract(es, fs, alignment))

    # test2: extract phrases for one sentence pair from a toy corpus and
    # print them, followed by the result of ``available_phrases``.
    from smt.utils.utility import mkcorpus
    from word_alignment import symmetrization
    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)
    pprint(ext)
    for e, f in ext:
        print(' '.join(e), "<->", ' '.join(f))

    ## phrases
    fs = "I am a teacher".split()
    # Only the target side of each extracted pair is passed on.
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
Esempio n. 9
0
 def train(self, loop_count):
     """Build a corpus from ``self.sentences`` and pass it to ``self._train``.

     Returns the result of ``self._train(corpus, loop_count)``.
     """
     return self._train(utility.mkcorpus(self.sentences), loop_count)
Esempio n. 10
0
    def test_phrase_extract(self):
        """Check ``phrase_extract`` on a fixed and on a computed alignment."""
        # The hand-written alignment below corresponds to this matrix
        # (one row per word of ``es``, one column per word of ``fs``):
        #
        # |x| | | | | | | | | |
        # | |x|x|x| | | | | | |
        # | | | | | |x| | | | |
        # | | | | | | |x| | | |
        # | | | | | | | | | |x|
        # | | | | | | | | | |x|
        # | | | | | | | |x| | |
        # | | | | | | | |x| | |
        # | | | | | | | | |x| |
        #
        es = "michael assumes that he will stay in the house".split()
        fs = "michael geht davon aus , dass er im haus bleibt".split()
        alignment = {(1, 1), (2, 2), (2, 3), (2, 4), (3, 6), (4, 7),
                     (5, 10), (6, 10), (7, 8), (8, 8), (9, 9)}
        ans = {
            (('assumes', ), ('geht', 'davon', 'aus')),
            (('assumes', ), ('geht', 'davon', 'aus', ',')),
            (('assumes', 'that'), ('geht', 'davon', 'aus', ',', 'dass')),
            (('assumes', 'that', 'he'),
             ('geht', 'davon', 'aus', ',', 'dass', 'er')),
            (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
              'bleibt')),
            (('he', ), ('er', )),
            (('he', 'will', 'stay', 'in', 'the', 'house'),
             ('er', 'im', 'haus', 'bleibt')),
            (('house', ), ('haus', )),
            (('in', 'the'), ('im', )),
            (('in', 'the', 'house'), ('im', 'haus')),
            (('michael', ), ('michael', )),
            (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus')),
            (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus', ',')),
            (('michael', 'assumes', 'that'),
             ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
            (('michael', 'assumes', 'that', 'he'),
             ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
            (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in', 'the',
              'house'),
             ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er', 'im',
              'haus', 'bleibt')),
            (('that', ), (',', 'dass')),
            (('that', ), ('dass', )),
            (('that', 'he'), (',', 'dass', 'er')),
            (('that', 'he'), ('dass', 'er')),
            (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
            (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
             ('dass', 'er', 'im', 'haus', 'bleibt')),
            (('will', 'stay'), ('bleibt', )),
            (('will', 'stay', 'in', 'the', 'house'), ('im', 'haus', 'bleibt')),
        }
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # another test: the alignment is computed from a small corpus
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentences = [
            ("僕 は 男 です", "I am a man"),
            ("私 は 女 です", "I am a girl"),
            ("私 は 先生 です", "I am a teacher"),
            ("彼女 は 先生 です", "She is a teacher"),
            ("彼 は 先生 です", "He is a teacher"),
        ]
        corpus = mkcorpus(sentences)
        alignment = symmetrization(es, fs, corpus)
        # The expected tokens are written as the characters themselves, not
        # as UTF-8 byte escapes, so they have the same type and value as
        # the tokens produced by ``split()`` above.
        ans = {
            (('は', '先生', 'です'), ('a', 'teacher')),
            (('先生', ), ('teacher', )),
            (('私', ), ('I', 'am')),
            (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher')),
        }
        self.assertEqual(phrase_extract(es, fs, alignment), ans)
Esempio n. 11
0
def train(sentences, loop_count=1000):
    """Train on the corpus that ``utility.mkcorpus`` builds from *sentences*.

    Returns the value produced by ``_train(corpus, loop_count)``.
    """
    return _train(utility.mkcorpus(sentences), loop_count)
Esempio n. 12
0
def train(sentences, loop_count=1000):
    """Wrap ``_train``: build the corpus first, then delegate.

    *sentences* goes through ``utility.mkcorpus``; *loop_count*
    (default 1000) is handed to ``_train`` as given.
    """
    built = utility.mkcorpus(sentences)
    outcome = _train(built, loop_count)
    return outcome
Esempio n. 13
0
    def test_phrase_extract(self):
        """Run ``phrase_extract`` against two expected phrase-pair sets.

        The first case uses a hand-written alignment, the second an
        alignment produced by ``symmetrization`` from a five-sentence
        corpus.
        """
        # The hand-written alignment is this matrix
        # (rows follow the words of ``es``, columns the words of ``fs``):
        #
        # |x| | | | | | | | | |
        # | |x|x|x| | | | | | |
        # | | | | | |x| | | | |
        # | | | | | | |x| | | |
        # | | | | | | | | | |x|
        # | | | | | | | | | |x|
        # | | | | | | | |x| | |
        # | | | | | | | |x| | |
        # | | | | | | | | |x| |
        #
        es = "michael assumes that he will stay in the house".split()
        fs = "michael geht davon aus , dass er im haus bleibt".split()
        alignment = {(1, 1),
                     (2, 2),
                     (2, 3),
                     (2, 4),
                     (3, 6),
                     (4, 7),
                     (5, 10),
                     (6, 10),
                     (7, 8),
                     (8, 8),
                     (9, 9)}
        ans = {(('assumes',), ('geht', 'davon', 'aus')),
               (('assumes',), ('geht', 'davon', 'aus', ',')),
               (('assumes', 'that'),
                ('geht', 'davon', 'aus', ',', 'dass')),
               (('assumes', 'that', 'he'),
                ('geht', 'davon', 'aus', ',', 'dass', 'er')),
               (('assumes', 'that', 'he',
                 'will', 'stay', 'in', 'the', 'house'),
                ('geht', 'davon', 'aus', ',', 'dass',
                 'er', 'im', 'haus', 'bleibt')),
               (('he',), ('er',)),
               (('he', 'will', 'stay', 'in', 'the', 'house'),
                ('er', 'im', 'haus', 'bleibt')),
               (('house',), ('haus',)),
               (('in', 'the'), ('im',)),
               (('in', 'the', 'house'), ('im', 'haus')),
               (('michael',), ('michael',)),
               (('michael', 'assumes'),
                ('michael', 'geht', 'davon', 'aus')),
               (('michael', 'assumes'),
                ('michael', 'geht', 'davon', 'aus', ',')),
               (('michael', 'assumes', 'that'),
                ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
               (('michael', 'assumes', 'that', 'he'),
                ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
               (('michael', 'assumes', 'that', 'he',
                 'will', 'stay', 'in', 'the', 'house'),
                ('michael', 'geht', 'davon', 'aus', ',', 'dass',
                 'er', 'im', 'haus', 'bleibt')),
               (('that',), (',', 'dass')),
               (('that',), ('dass',)),
               (('that', 'he'), (',', 'dass', 'er')),
               (('that', 'he'), ('dass', 'er')),
               (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
                (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
               (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
                ('dass', 'er', 'im', 'haus', 'bleibt')),
               (('will', 'stay'), ('bleibt',)),
               (('will', 'stay', 'in', 'the', 'house'),
                ('im', 'haus', 'bleibt'))}
        self.assertEqual(phrase_extract(es, fs, alignment), ans)

        # another test: alignment derived from a small training corpus
        es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
        sentences = [("僕 は 男 です", "I am a man"),
                     ("私 は 女 です", "I am a girl"),
                     ("私 は 先生 です", "I am a teacher"),
                     ("彼女 は 先生 です", "She is a teacher"),
                     ("彼 は 先生 です", "He is a teacher"),
                     ]
        corpus = mkcorpus(sentences)
        alignment = symmetrization(es, fs, corpus)
        # Expected tokens are spelled with the characters themselves rather
        # than UTF-8 byte escapes, so they compare equal to the tokens that
        # ``split()`` yields from the literals above.
        ans = {(('は', '先生', 'です'),
                ('a', 'teacher')),
               (('先生',), ('teacher',)),
               (('私',), ('I', 'am')),
               (('私', 'は', '先生', 'です'),
                ('I', 'am', 'a', 'teacher'))}
        self.assertEqual(phrase_extract(es, fs, alignment), ans)