def test_phrases():
    """Extract phrases for one sentence pair and check available_phrases.

    Builds a corpus from five Japanese/English sentence pairs, symmetrizes
    the alignment of one pair, extracts its phrase pairs, and then checks
    both the extracted pairs and the result of ``available_phrases`` for
    the English side.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # Every expected pair must be its own element: without the separating
    # commas, adjacent string literals are silently concatenated into one.
    ans = {
        "は 先生 です <-> a teacher",
        "先生 <-> teacher",
        "私 <-> I am",
        "私 は 先生 です <-> I am a teacher",
    }
    rendered = set()
    for e, f in ext:
        line = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(line)
        rendered.add(line)
    assert ans == rendered

    # phrases
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    ans = {
        ((1, 'I'), (2, 'am')),
        ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
        ((4, 'teacher'),),
        ((3, 'a'), (4, 'teacher')),
    }
    assert ans == phrases
def test_phrases():
    """Check phrase extraction and available_phrases for one sentence pair.

    A corpus of five Japanese/English sentence pairs is used to symmetrize
    the alignment of "私 は 先生 です" / "I am a teacher"; the phrase pairs
    extracted from that alignment and the phrases that ``available_phrases``
    reports for the English sentence are compared with expected values.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # One string per expected phrase pair. The commas matter: adjacent
    # string literals without them are merged into a single string.
    ans = {
        "は 先生 です <-> a teacher",
        "先生 <-> teacher",
        "私 <-> I am",
        "私 は 先生 です <-> I am a teacher",
    }
    shown = set()
    for e, f in ext:
        text = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(text)
        shown.add(text)
    assert ans == shown

    # phrases
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    ans = {
        ((1, 'I'), (2, 'am')),
        ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
        ((4, 'teacher'),),
        ((3, 'a'), (4, 'teacher')),
    }
    assert ans == phrases
def test_symmetrization(self):
    """Compare symmetrization of one sentence pair with a fixed alignment."""
    sentence_pairs = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentence_pairs)

    es = "私 は 先生 です".split()
    fs = "I am a teacher".split()
    actual = symmetrization(es, fs, corpus)

    # Expected alignment points for this sentence pair.
    expected = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
    self.assertEqual(actual, expected)
def test_symmetrization(self):
    """Symmetrize one sentence pair and check the resulting alignment set."""
    es = "私 は 先生 です".split()
    fs = "I am a teacher".split()

    training_pairs = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    result = symmetrization(es, fs, mkcorpus(training_pairs))

    # Alignment points the symmetrized result is expected to contain.
    wanted = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
    self.assertEqual(result, wanted)
def test_phrase_extract(self):
    """Check phrase_extract on a hand-written and on a computed alignment."""
    # The alignment below drawn as a matrix: one row per word of `es`,
    # one column per word of `fs`, `x` marking an aligned pair.
    #
    # |x| | | | | | | | | |
    # | |x|x|x| | | | | | |
    # | | | | | |x| | | | |
    # | | | | | | |x| | | |
    # | | | | | | | | | |x|
    # | | | | | | | | | |x|
    # | | | | | | | |x| | |
    # | | | | | | | |x| | |
    # | | | | | | | | |x| |
    #
    es = "michael assumes that he will stay in the house".split()
    fs = "michael geht davon aus , dass er im haus bleibt".split()
    alignment = {(1, 1), (2, 2), (2, 3), (2, 4), (3, 6), (4, 7),
                 (5, 10), (6, 10), (7, 8), (8, 8), (9, 9)}
    ans = {
        (('assumes',), ('geht', 'davon', 'aus')),
        (('assumes',), ('geht', 'davon', 'aus', ',')),
        (('assumes', 'that'), ('geht', 'davon', 'aus', ',', 'dass')),
        (('assumes', 'that', 'he'),
         ('geht', 'davon', 'aus', ',', 'dass', 'er')),
        (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
          'bleibt')),
        (('he',), ('er',)),
        (('he', 'will', 'stay', 'in', 'the', 'house'),
         ('er', 'im', 'haus', 'bleibt')),
        (('house',), ('haus',)),
        (('in', 'the'), ('im',)),
        (('in', 'the', 'house'), ('im', 'haus')),
        (('michael',), ('michael',)),
        (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus')),
        (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus', ',')),
        (('michael', 'assumes', 'that'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
        (('michael', 'assumes', 'that', 'he'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
        (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in', 'the',
          'house'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er', 'im',
          'haus', 'bleibt')),
        (('that',), (',', 'dass')),
        (('that',), ('dass',)),
        (('that', 'he'), (',', 'dass', 'er')),
        (('that', 'he'), ('dass', 'er')),
        (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
        (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         ('dass', 'er', 'im', 'haus', 'bleibt')),
        (('will', 'stay'), ('bleibt',)),
        (('will', 'stay', 'in', 'the', 'house'), ('im', 'haus', 'bleibt')),
    }
    self.assertEqual(phrase_extract(es, fs, alignment), ans)

    # another test: alignment computed by symmetrization from a corpus
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    alignment = symmetrization(es, fs, corpus)
    # The expected tokens are written as literals, not as UTF-8 byte
    # escapes: escapes such as '\xe3\x81\xaf' only equal the tokens produced
    # by str.split() when strings are byte strings, whereas the literals
    # match the split tokens for both byte and text strings.
    ans = {
        (('は', '先生', 'です'), ('a', 'teacher')),
        (('先生',), ('teacher',)),
        (('私',), ('I', 'am')),
        (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher')),
    }
    self.assertEqual(phrase_extract(es, fs, alignment), ans)
def test_phrase_extract(self):
    """Verify phrase_extract against two expected phrase-pair sets.

    The first case uses a fixed, hand-written alignment; the second uses
    the alignment that ``symmetrization`` computes from a small corpus.
    """
    # next alignment matrix is like
    # (rows follow the words of `es`, columns the words of `fs`)
    #
    # |x| | | | | | | | | |
    # | |x|x|x| | | | | | |
    # | | | | | |x| | | | |
    # | | | | | | |x| | | |
    # | | | | | | | | | |x|
    # | | | | | | | | | |x|
    # | | | | | | | |x| | |
    # | | | | | | | |x| | |
    # | | | | | | | | |x| |
    #
    es = "michael assumes that he will stay in the house".split()
    fs = "michael geht davon aus , dass er im haus bleibt".split()
    alignment = set([(1, 1), (2, 2), (2, 3), (2, 4), (3, 6), (4, 7),
                     (5, 10), (6, 10), (7, 8), (8, 8), (9, 9)])
    ans = set([
        (('assumes',), ('geht', 'davon', 'aus')),
        (('assumes',), ('geht', 'davon', 'aus', ',')),
        (('assumes', 'that'), ('geht', 'davon', 'aus', ',', 'dass')),
        (('assumes', 'that', 'he'),
         ('geht', 'davon', 'aus', ',', 'dass', 'er')),
        (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
          'bleibt')),
        (('he',), ('er',)),
        (('he', 'will', 'stay', 'in', 'the', 'house'),
         ('er', 'im', 'haus', 'bleibt')),
        (('house',), ('haus',)),
        (('in', 'the'), ('im',)),
        (('in', 'the', 'house'), ('im', 'haus')),
        (('michael',), ('michael',)),
        (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus')),
        (('michael', 'assumes'), ('michael', 'geht', 'davon', 'aus', ',')),
        (('michael', 'assumes', 'that'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
        (('michael', 'assumes', 'that', 'he'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
        (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in', 'the',
          'house'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er', 'im',
          'haus', 'bleibt')),
        (('that',), (',', 'dass')),
        (('that',), ('dass',)),
        (('that', 'he'), (',', 'dass', 'er')),
        (('that', 'he'), ('dass', 'er')),
        (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
        (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         ('dass', 'er', 'im', 'haus', 'bleibt')),
        (('will', 'stay'), ('bleibt',)),
        (('will', 'stay', 'in', 'the', 'house'), ('im', 'haus', 'bleibt')),
    ])
    self.assertEqual(phrase_extract(es, fs, alignment), ans)

    # another test
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    alignment = symmetrization(es, fs, corpus)
    # Spell the expected tokens as literals instead of UTF-8 byte escapes.
    # Escaped bytes like '\xe7\xa7\x81' are equal to the split tokens only
    # when strings are byte strings; the literals compare equal to the
    # tokens whether the strings are bytes or text.
    ans = set([
        (('は', '先生', 'です'), ('a', 'teacher')),
        (('先生',), ('teacher',)),
        (('私',), ('I', 'am')),
        (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher')),
    ])
    self.assertEqual(phrase_extract(es, fs, alignment), ans)