def test_replacement(self):
        self.tests = [[]]
        for line in codecs.open(u"test.txt", u"r", u"utf8"):
            line = line.lstrip().rstrip()
#            波ダッシュ(U+301C)を全角チルダ(U+FF5E)に変更
            line = line.replace(u"\u301C", u"\uFF5E")

            if len(line)!=0 and not line.startswith(u"#"):
                items = line.split(u"\t")
                if len(items) != 3:
                    continue
                if len(self.tests[-1]) == 3:
                    self.tests[-1] += items[1:]
                else:
                    self.tests.append(items[0:])
        del self.tests[0]

        for (contained_symbols, eng, jpn, gold_new_eng, gold_new_jpn) in self.tests:
            new_eng, new_jpn, isOK = symbols.getReplaced(eng, jpn)
            print "--"
            print contained_symbols
            print "", eng, jpn
            print ">>", new_eng, new_jpn
            print "Gold>>", gold_new_eng, gold_new_jpn
            print "OK?", isOK
            if isOK:
                self.assertEqual(new_eng, gold_new_eng)
                self.assertEqual(new_jpn, gold_new_jpn)
# Example no. 2
# 0
def convert(ifname, ofname):
    """Split a dictionary file into word records and phrase records.

    Every input line is parsed with ``lineparse`` into
    ``(entry, pos, translations, memo, samples)``.

    * Lines with a part of speech (``pos is not None``) are treated as words.
      Their info is accumulated per entry in an in-memory dict; a word record
      is written with ``out2file`` when a different word entry follows, and
      the last pending record is written once the input is exhausted.
    * All other lines are treated as phrases.  ``symbols.getReplaced`` is
      applied to the entry and its first translation, and one record per line
      is written with ``out2file``.

    Args:
        ifname: Path of the input file.  It is decoded with the module-level
            ``opts.encode`` setting.
        ofname: Prefix of the output paths; records go to
            ``ofname + 'word.jsons'`` and ``ofname + 'phrase.jsons'``
            (both opened as UTF-8).
    """
    prev_word_entry = None
    wordid = 0  # TODO: does this suppose the input order is fixed?
    phraseid = 0
    words = {}
    prev_line = None

    # Context managers guarantee that all three files are flushed and closed,
    # including when parsing raises part-way through.
    with codecs.open(ofname + 'word.jsons', 'w', 'utf8') as f_out_word, \
            codecs.open(ofname + 'phrase.jsons', 'w', 'utf8') as f_out_phrase, \
            codecs.open(ifname, 'r', opts.encode) as f:
        for lid, line in enumerate(f):
            # Skip a line that exactly repeats the previous one.
            if line == prev_line:
                continue
            prev_line = line

            if lid % 10000 == 0:
                # Progress indicator, rewritten in place on stderr.
                sys.stderr.write("\r%10d" % lid)

            entry, pos, translations, memo, samples = lineparse(line.rstrip())
            if memo is None:
                memo = "-"

            # TODO: how about 'acrylyl group'?
            if pos is not None:
                # A part of speech is present, so this is taken to be a word.
                if prev_word_entry is not None and prev_word_entry != entry:
                    # The word entry changed: emit the previous word's record.
                    out2file(f_out_word, words[prev_word_entry])

                if entry not in words:
                    wordid += 1
                    words[entry] = {
                        u"wordid": wordid,
                        u"entry": entry,
                        u"info": [],
                    }

                words[entry][u"info"].append({
                    u"pos": pos,
                    u"translations": translations,
                    u"memo": memo,
                    u"samples": samples,
                })
                prev_word_entry = entry
                continue

            # FIXME: this is either a "pattern" or a "phrase".
            phraseid += 1

            eng = entry
            first_translation = translations[0]  # only the first translation is used
            jpn = first_translation[u"translation"]

            # Label handling: restore the possessive the label stands for.
            labels = first_translation[u"labels"]
            if u"《one's ~》" in labels:
                eng = u"one's " + eng
            if u"《someone's ~》" in labels:
                eng = u"someone's " + eng

            new_eng, new_jpn, isOK = symbols.getReplaced(eng, jpn)

            out2file(f_out_phrase, {
                u"phraseid": phraseid,
                u"entry": entry,
                u"translations": translations,
                u"error": not isOK,
                u"eng": new_eng,
                u"jpn": new_jpn,
                u"labels": labels,
                u"memo": memo,
                u"samples": samples,
            })

        # Word records are only emitted when the next word entry differs, so
        # the final one is still pending here and must be written explicitly.
        if prev_word_entry is not None:
            out2file(f_out_word, words[prev_word_entry])