def test_phrases():
    """Check phrase extraction and available-phrase lookup on a toy corpus.

    A five-pair corpus is built with ``mkcorpus``, the pair
    ("私 は 先生 です", "I am a teacher") is aligned with ``symmetrization``,
    and ``phrase_extract`` is run on that alignment.  The extracted pairs
    are printed as "<es words> <-> <fs words>" and compared with the
    expected lines; the fs-side phrases are then handed to
    ``available_phrases`` and its result is compared with the expected set.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # The expected strings used to be written without separating commas,
    # which silently fused three of them into a single string, and the
    # value was never compared with anything.
    expected_lines = {
        "は 先生 です <-> a teacher",
        "先生 <-> teacher",
        "私 <-> I am",
        "私 は 先生 です <-> I am a teacher",
    }
    rendered_lines = set()
    for e, f in ext:
        line = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(line)
        rendered_lines.add(line)
    assert expected_lines == rendered_lines

    ## phrases
    fs = "I am a teacher".split()
    # Only the fs side of every extracted pair is passed on.
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    ans = {
        ((1, 'I'), (2, 'am')),
        ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
        ((4, 'teacher'),),
        ((3, 'a'), (4, 'teacher')),
    }
    assert ans == phrases
def test_phrases():
    """Exercise ``phrase_extract`` and ``available_phrases`` end to end.

    Builds a corpus from five sentence pairs, symmetrizes the alignment of
    ("私 は 先生 です", "I am a teacher"), extracts phrase pairs, prints each
    one as "<es words> <-> <fs words>", and asserts both the printed lines
    and the ``available_phrases`` result against their expected values.
    """
    from smt.utils.utility import mkcorpus
    from smt.phrase.word_alignment import symmetrization

    sentenses = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]
    corpus = mkcorpus(sentenses)
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    alignment = symmetrization(es, fs, corpus)
    ext = phrase_extract(es, fs, alignment)

    # Each expected line needs its own comma: without them adjacent string
    # literals are concatenated into one element.  The expectation is also
    # actually checked now instead of being overwritten unused.
    expected_lines = {"は 先生 です <-> a teacher",
                      "先生 <-> teacher",
                      "私 <-> I am",
                      "私 は 先生 です <-> I am a teacher"}
    seen_lines = set()
    for e, f in ext:
        text = "{} {} {}".format(' '.join(e), "<->", ' '.join(f))
        print(text)
        seen_lines.add(text)
    assert expected_lines == seen_lines

    ## phrases
    fs = "I am a teacher".split()
    # A single call is enough; the result was previously computed twice
    # with identical arguments.
    phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
    print(phrases)
    ans = {((1, 'I'), (2, 'am')),
           ((1, 'I'), (2, 'am'), (3, 'a'), (4, 'teacher')),
           ((4, 'teacher'),),
           ((3, 'a'), (4, 'teacher'))}
    assert ans == phrases
def test_symmetrization(self):
    """Symmetrized alignment of the teacher sentence pair equals the known set."""
    training_pairs = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(training_pairs)
    es_words = "私 は 先生 です".split()
    fs_words = "I am a teacher".split()
    # Expected index pairs, exactly as compared by assertEqual below.
    expected = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
    actual = symmetrization(es_words, fs_words, corpus)
    self.assertEqual(actual, expected)
def test_symmetrization(self):
    """Verify ``symmetrization`` on one sentence pair of a five-pair corpus."""
    pairs = [("僕 は 男 です", "I am a man"),
             ("私 は 女 です", "I am a girl"),
             ("私 は 先生 です", "I am a teacher"),
             ("彼女 は 先生 です", "She is a teacher"),
             ("彼 は 先生 です", "He is a teacher")]
    corpus = mkcorpus(pairs)
    # Split both sides of the pair under test into word lists.
    es, fs = "私 は 先生 です".split(), "I am a teacher".split()
    result = symmetrization(es, fs, corpus)
    wanted = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 3)}
    self.assertEqual(result, wanted)
def train(sentences, loop_count=1000):
    """Build a corpus from ``sentences`` and train on it.

    ``sentences`` is passed to ``utility.mkcorpus``; the resulting corpus
    and ``loop_count`` are forwarded unchanged to ``_train``, whose return
    value is returned as-is.
    """
    return _train(utility.mkcorpus(sentences), loop_count)
if __name__ == '__main__':
    # Command-line driver.
    #   argv[1]: training file, one comma-delimited sentence pair per line.
    #   stdin:   comma-delimited sentence pairs to align, one per line.
    # test2
    from smt.utils.utility import mkcorpus
    from word_alignment import alignment
    from smt.ibmmodel import ibmmodel2
    import sys

    delimiter = ","
    # load file which will be trained
    # The file is read inside a ``with`` block so the handle is closed
    # as soon as the lines have been consumed.
    with open(sys.argv[1]) as modelfd:
        sentenses = [line.rstrip().split(delimiter) for line in modelfd]
    # make corpus
    corpus = mkcorpus(sentenses)

    # train model from corpus
    f2e_train = ibmmodel2._train(corpus, loop_count=10)
    # Transpose the corpus into columns, reverse the column order, and
    # transpose back: every row comes out with its elements reversed.
    e2f_corpus = list(zip(*reversed(list(zip(*corpus)))))
    e2f_train = ibmmodel2._train(e2f_corpus, loop_count=10)

    # phrase extraction
    for line in sys.stdin:
        _es, _fs = line.rstrip().split(delimiter)
        es = _es.split()
        fs = _fs.split()

        # Viterbi alignments in both directions, each with its own model.
        f2e = ibmmodel2.viterbi_alignment(es, fs, *f2e_train).items()
        e2f = ibmmodel2.viterbi_alignment(fs, es, *e2f_train).items()
        align = alignment(es, fs, e2f, f2e)  # symmetrized alignment
# (6, 10),
# (7, 8),
# (8, 8),
# (9, 9)])
#pprint(phrase_extract(es, fs, alignment))

# test2
# Align one sentence pair against a five-pair toy corpus, extract phrase
# pairs from the alignment, and print them.
from smt.utils.utility import mkcorpus
from word_alignment import symmetrization

sentenses = [
    ("僕 は 男 です", "I am a man"),
    ("私 は 女 です", "I am a girl"),
    ("私 は 先生 です", "I am a teacher"),
    ("彼女 は 先生 です", "She is a teacher"),
    ("彼 は 先生 です", "He is a teacher"),
]
corpus = mkcorpus(sentenses)
es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
alignment = symmetrization(es, fs, corpus)
ext = phrase_extract(es, fs, alignment)
pprint(ext)
for e, f in ext:
    # A single formatted string prints as "<es> <-> <fs>" whether or not
    # ``print`` is a function in the running interpreter.
    print("{} {} {}".format(' '.join(e), "<->", ' '.join(f)))

## phrases
fs = "I am a teacher".split()
# A hard-coded set of word tuples used to be assigned to ``phrases`` here
# and was overwritten by the very next statement; that dead store is gone.
phrases = available_phrases(fs, [fs_ph for (es_ph, fs_ph) in ext])
print(phrases)
def train(self, loop_count):
    """Build a corpus from ``self.sentences`` and train on it.

    ``self.sentences`` is passed to ``utility.mkcorpus``; the resulting
    corpus and ``loop_count`` are forwarded unchanged to ``self._train``,
    whose return value is returned as-is.
    """
    # ``corpus`` must be bound before the call below; the leftover
    # commented-out debug loop that surrounded this assignment is removed.
    corpus = utility.mkcorpus(self.sentences)
    return self._train(corpus, loop_count)
def test_phrase_extract(self):
    """Check ``phrase_extract`` on a fixed alignment and on a computed one.

    The first case uses a hand-written alignment between an English and a
    German sentence.  The second case computes the alignment with
    ``symmetrization`` over a five-pair toy corpus.  In both cases the
    result of ``phrase_extract`` is compared with an explicit set of
    (es phrase, fs phrase) tuples.
    """
    # next alignment matrix is like
    # (one row per word of es, one column per word of fs)
    #
    # |x| | | | | | | | | |
    # | |x|x|x| | | | | | |
    # | | | | | |x| | | | |
    # | | | | | | |x| | | |
    # | | | | | | | | | |x|
    # | | | | | | | | | |x|
    # | | | | | | | |x| | |
    # | | | | | | | |x| | |
    # | | | | | | | | |x| |
    #
    es = "michael assumes that he will stay in the house".split()
    fs = "michael geht davon aus , dass er im haus bleibt".split()
    alignment = {(1, 1),
                 (2, 2),
                 (2, 3),
                 (2, 4),
                 (3, 6),
                 (4, 7),
                 (5, 10),
                 (6, 10),
                 (7, 8),
                 (8, 8),
                 (9, 9)}
    ans = {
        (('assumes',), ('geht', 'davon', 'aus')),
        (('assumes',), ('geht', 'davon', 'aus', ',')),
        (('assumes', 'that'),
         ('geht', 'davon', 'aus', ',', 'dass')),
        (('assumes', 'that', 'he'),
         ('geht', 'davon', 'aus', ',', 'dass', 'er')),
        (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
          'bleibt')),
        (('he',), ('er',)),
        (('he', 'will', 'stay', 'in', 'the', 'house'),
         ('er', 'im', 'haus', 'bleibt')),
        (('house',), ('haus',)),
        (('in', 'the'), ('im',)),
        (('in', 'the', 'house'), ('im', 'haus')),
        (('michael',), ('michael',)),
        (('michael', 'assumes'),
         ('michael', 'geht', 'davon', 'aus')),
        (('michael', 'assumes'),
         ('michael', 'geht', 'davon', 'aus', ',')),
        (('michael', 'assumes', 'that'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
        (('michael', 'assumes', 'that', 'he'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
        (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in', 'the',
          'house'),
         ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er', 'im',
          'haus', 'bleibt')),
        (('that',), (',', 'dass')),
        (('that',), ('dass',)),
        (('that', 'he'), (',', 'dass', 'er')),
        (('that', 'he'), ('dass', 'er')),
        (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
        (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
         ('dass', 'er', 'im', 'haus', 'bleibt')),
        (('will', 'stay'), ('bleibt',)),
        (('will', 'stay', 'in', 'the', 'house'),
         ('im', 'haus', 'bleibt')),
    }
    self.assertEqual(phrase_extract(es, fs, alignment), ans)

    # another test
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    sentenses = [
        ("僕 は 男 です", "I am a man"),
        ("私 は 女 です", "I am a girl"),
        ("私 は 先生 です", "I am a teacher"),
        ("彼女 は 先生 です", "She is a teacher"),
        ("彼 は 先生 です", "He is a teacher"),
    ]
    corpus = mkcorpus(sentenses)
    alignment = symmetrization(es, fs, corpus)
    # The expected words are written as the same text literals the input
    # is split from, not as hand-encoded UTF-8 byte escapes, so the
    # comparison also holds where ``str`` is Unicode text.
    ans = {
        (('は', '先生', 'です'), ('a', 'teacher')),
        (('先生',), ('teacher',)),
        (('私',), ('I', 'am')),
        (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher')),
    }
    self.assertEqual(phrase_extract(es, fs, alignment), ans)
def train(sentences, loop_count=1000):
    """Turn ``sentences`` into a corpus and hand it to ``_train``.

    The corpus comes from ``utility.mkcorpus(sentences)``; ``loop_count``
    is forwarded to ``_train`` untouched and the trainer's result is
    returned directly.
    """
    prepared = utility.mkcorpus(sentences)
    result = _train(prepared, loop_count)
    return result
def test_phrase_extract(self):
    """Compare ``phrase_extract`` output with two explicit expectations.

    Case one feeds a hand-written English/German alignment; case two
    feeds an alignment produced by ``symmetrization`` from a five-pair
    toy corpus.  Each result must equal the listed set of
    (es phrase, fs phrase) tuples.
    """
    # next alignment matrix is like
    # (one row per word of es, one column per word of fs)
    #
    # |x| | | | | | | | | |
    # | |x|x|x| | | | | | |
    # | | | | | |x| | | | |
    # | | | | | | |x| | | |
    # | | | | | | | | | |x|
    # | | | | | | | | | |x|
    # | | | | | | | |x| | |
    # | | | | | | | |x| | |
    # | | | | | | | | |x| |
    #
    es = "michael assumes that he will stay in the house".split()
    fs = "michael geht davon aus , dass er im haus bleibt".split()
    alignment = {(1, 1), (2, 2), (2, 3), (2, 4), (3, 6), (4, 7),
                 (5, 10), (6, 10), (7, 8), (8, 8), (9, 9)}
    ans = {(('assumes',), ('geht', 'davon', 'aus')),
           (('assumes',), ('geht', 'davon', 'aus', ',')),
           (('assumes', 'that'),
            ('geht', 'davon', 'aus', ',', 'dass')),
           (('assumes', 'that', 'he'),
            ('geht', 'davon', 'aus', ',', 'dass', 'er')),
           (('assumes', 'that', 'he', 'will', 'stay', 'in', 'the',
             'house'),
            ('geht', 'davon', 'aus', ',', 'dass', 'er', 'im', 'haus',
             'bleibt')),
           (('he',), ('er',)),
           (('he', 'will', 'stay', 'in', 'the', 'house'),
            ('er', 'im', 'haus', 'bleibt')),
           (('house',), ('haus',)),
           (('in', 'the'), ('im',)),
           (('in', 'the', 'house'), ('im', 'haus')),
           (('michael',), ('michael',)),
           (('michael', 'assumes'),
            ('michael', 'geht', 'davon', 'aus')),
           (('michael', 'assumes'),
            ('michael', 'geht', 'davon', 'aus', ',')),
           (('michael', 'assumes', 'that'),
            ('michael', 'geht', 'davon', 'aus', ',', 'dass')),
           (('michael', 'assumes', 'that', 'he'),
            ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er')),
           (('michael', 'assumes', 'that', 'he', 'will', 'stay', 'in',
             'the', 'house'),
            ('michael', 'geht', 'davon', 'aus', ',', 'dass', 'er', 'im',
             'haus', 'bleibt')),
           (('that',), (',', 'dass')),
           (('that',), ('dass',)),
           (('that', 'he'), (',', 'dass', 'er')),
           (('that', 'he'), ('dass', 'er')),
           (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
            (',', 'dass', 'er', 'im', 'haus', 'bleibt')),
           (('that', 'he', 'will', 'stay', 'in', 'the', 'house'),
            ('dass', 'er', 'im', 'haus', 'bleibt')),
           (('will', 'stay'), ('bleibt',)),
           (('will', 'stay', 'in', 'the', 'house'),
            ('im', 'haus', 'bleibt'))}
    self.assertEqual(phrase_extract(es, fs, alignment), ans)

    # another test
    es, fs = ("私 は 先生 です".split(), "I am a teacher".split())
    sentenses = [("僕 は 男 です", "I am a man"),
                 ("私 は 女 です", "I am a girl"),
                 ("私 は 先生 です", "I am a teacher"),
                 ("彼女 は 先生 です", "She is a teacher"),
                 ("彼 は 先生 です", "He is a teacher"),
                 ]
    corpus = mkcorpus(sentenses)
    alignment = symmetrization(es, fs, corpus)
    # Real text literals replace the former UTF-8 byte escapes: they match
    # the words produced by splitting the literal above whether ``str`` is
    # a byte string or Unicode text.
    ans = {(('は', '先生', 'です'), ('a', 'teacher')),
           (('先生',), ('teacher',)),
           (('私',), ('I', 'am')),
           (('私', 'は', '先生', 'です'), ('I', 'am', 'a', 'teacher'))}
    self.assertEqual(phrase_extract(es, fs, alignment), ans)