def test_replacement(self):
    """Check ``symbols.getReplaced`` against the gold cases in ``test.txt``.

    The fixture is read as UTF-8. Blank lines, lines starting with ``#`` and
    lines that do not have exactly three tab-separated fields are ignored.
    Every test case is made of two consecutive accepted rows:

    * row 1: ``contained_symbols``, ``eng``, ``jpn``
    * row 2: fields 2 and 3 are the expected ``gold_new_eng`` and
      ``gold_new_jpn`` (field 1 of this row is discarded).

    The collected cases are stored on ``self.tests`` as five-element lists.
    The gold values are only asserted when ``getReplaced`` reports success.
    """
    self.tests = []
    # Use a context manager so the fixture handle is closed even when the
    # parsing below raises.
    with codecs.open(u"test.txt", u"r", u"utf8") as fixture:
        for line in fixture:
            line = line.strip()
            # Replace WAVE DASH (U+301C) with FULLWIDTH TILDE (U+FF5E).
            line = line.replace(u"\u301C", u"\uFF5E")
            if not line or line.startswith(u"#"):
                continue
            items = line.split(u"\t")
            if len(items) != 3:
                continue
            if self.tests and len(self.tests[-1]) == 3:
                # Second row of a case: append the two gold columns.
                self.tests[-1] += items[1:]
            else:
                # First row of a new case.
                self.tests.append(items)

    for (contained_symbols, eng, jpn,
         gold_new_eng, gold_new_jpn) in self.tests:
        new_eng, new_jpn, is_ok = symbols.getReplaced(eng, jpn)
        # Single-argument print(...) emits the same text under Python 2
        # and Python 3.
        print("--")
        print(contained_symbols)
        print(u" %s %s" % (eng, jpn))
        print(u">> %s %s" % (new_eng, new_jpn))
        print(u"Gold>> %s %s" % (gold_new_eng, gold_new_jpn))
        print(u"OK? %s" % is_ok)
        # NOTE(review): cases where getReplaced reports failure are not
        # asserted at all -- confirm that this is intentional.
        if is_ok:
            self.assertEqual(new_eng, gold_new_eng)
            self.assertEqual(new_jpn, gold_new_jpn)
def convert(ifname, ofname):
    """Split a dictionary source file into word and phrase output files.

    Every input line (immediately repeated identical lines are skipped) is
    parsed with ``lineparse``. A line that carries a part of speech is
    treated as a word: its information is accumulated under its entry and
    the accumulated record is passed to ``out2file`` once a different word
    entry is seen, or at end of input for the last one. Any other line is
    treated as a phrase: its first translation is normalised through
    ``symbols.getReplaced`` and the record is passed to ``out2file``
    immediately.

    Args:
        ifname: Path of the input file. It is decoded with the module-level
            ``opts.encode`` setting.
        ofname: Prefix of the output paths; ``word.jsons`` and
            ``phrase.jsons`` are appended to it.
    """
    # The context managers guarantee that all three files are flushed and
    # closed, including when parsing fails half-way through.
    with codecs.open(ofname + 'word.jsons', 'w', 'utf8') as f_out_word, \
            codecs.open(ofname + 'phrase.jsons', 'w', 'utf8') as f_out_phrase, \
            codecs.open(ifname, 'r', opts.encode) as f:
        prev_word_entry = None
        wordid = 0  # TODO suppose the order is fixed?
        phraseid = 0
        words = {}
        prev_line = None

        for lid, line in enumerate(f):
            # Skip a line identical to the one directly before it.
            if line == prev_line:
                continue
            prev_line = line

            # Progress indicator on stderr.
            if lid % 10000 == 0:
                sys.stderr.write("\r%10d" % lid)

            entry, pos, translations, memo, samples = lineparse(line.rstrip())
            if memo is None:
                memo = "-"

            # TODO how about 'acrylyl group' ???
            if pos is not None:
                # Having a part of speech is taken to mean "this is a word".
                # The previous word is written out as soon as a different
                # word entry starts.
                if prev_word_entry is not None and prev_word_entry != entry:
                    out2file(f_out_word, words[prev_word_entry])

                if entry not in words:
                    wordid += 1
                    words[entry] = {u"wordid": wordid,
                                    u"entry": entry,
                                    u"info": []}

                words[entry][u"info"].append({u"pos": pos,
                                              u"translations": translations,
                                              u"memo": memo,
                                              u"samples": samples})
                prev_word_entry = entry
            else:
                # FIXME: this is either a "pattern" or a "phrase".
                phraseid += 1
                eng = entry
                # Only the first translation and its labels are used.
                jpn = translations[0][u"translation"]
                labels = translations[0][u"labels"]

                # Label handling: restore the possessive the label implies.
                if u"《one's ~》" in labels:
                    eng = u"one's " + eng
                if u"《someone's ~》" in labels:
                    eng = u"someone's " + eng

                new_eng, new_jpn, is_ok = symbols.getReplaced(eng, jpn)
                out2file(f_out_phrase, {u"phraseid": phraseid,
                                        u"entry": entry,
                                        u"translations": translations,
                                        u"error": not is_ok,
                                        u"eng": new_eng,
                                        u"jpn": new_jpn,
                                        u"labels": labels,
                                        u"memo": memo,
                                        u"samples": samples,
                                        })

        # The loop only writes a word when the *next* word entry appears, so
        # the last accumulated word has to be written explicitly.
        if prev_word_entry is not None:
            out2file(f_out_word, words[prev_word_entry])