def main():
    """Annotate every line read from stdin with a pinyin reading per character.

    Each input line is decoded as UTF-8 and stripped.  Every character that
    is not a lowercase ASCII letter becomes one character of the output text;
    a run of lowercase ASCII letters that directly follows such a character
    is taken as that character's pinyin.  Letters that appear before the
    first such character are discarded.  A character without an explicit
    pinyin is looked up in the table loaded from ``../pinyin_table.txt`` and
    must have exactly one reading there.

    For each line that resolves completely, ``text<TAB>pinyin'pinyin'...``
    is printed to stdout encoded as UTF-8.  Otherwise ``<n> : error!`` is
    written to stderr, where ``n`` is the 0-based index of the input line,
    and the line is skipped.
    """
    # Close the table file deterministically instead of leaking the handle.
    # NOTE(review): assumes PYUtil.load_pinyin_table consumes the file
    # before returning -- confirm.
    with open("../pinyin_table.txt") as table_file:
        hanzi_dict = PYUtil.load_pinyin_table(table_file)

    lowercase = u"abcdefghijklmnopqrstuvwxyz"

    # enumerate() keeps the reported index in step with the input even after
    # a failed line; the old hand-maintained counter was skipped on errors.
    for line_no, raw_line in enumerate(sys.stdin):
        line = unicode(raw_line, "utf8").strip()

        strings = []
        pinyins = []
        pinyin = None
        for c in line:
            if c in lowercase:
                pinyin = c if pinyin is None else pinyin + c
            else:
                # A new text character ends the pinyin run of the previous one.
                pinyins.append(pinyin)
                pinyin = None
                strings.append(c)
        pinyins.append(pinyin)
        # The first entry is whatever preceded the first text character.
        pinyins = pinyins[1:]

        resolved = True
        for i, char in enumerate(strings):
            if pinyins[i] is not None:
                continue
            try:
                readings = hanzi_dict[char]
            except KeyError:
                resolved = False
                break
            # Only an unambiguous (single) reading can be filled in.
            if len(readings) != 1:
                resolved = False
                break
            pinyins[i] = list(readings.keys())[0]

        if not resolved:
            sys.stderr.write("%d : error!\n" % line_no)
            continue

        output = u"%s\t%s" % (u"".join(strings), u"'".join(pinyins))
        print(output.encode("utf8"))
def load_pinyin_table():
    """Load the pinyin table and keep only the readings of each entry.

    The table is read from ``../../../../data/pinyin_table.txt`` (relative to
    the current working directory) through ``PYUtil.load_pinyin_table``.

    Returns:
        A dict mapping every key of the loaded table to the list of pinyin
        strings recorded for it; the value paired with each pinyin
        (presumably a frequency) is dropped.
    """
    # Close the table file deterministically instead of leaking the handle.
    # NOTE(review): assumes PYUtil.load_pinyin_table consumes the file
    # before returning -- confirm.
    with open("../../../../data/pinyin_table.txt") as table_file:
        hanzi_dict = PYUtil.load_pinyin_table(table_file)

    readings_by_char = {}
    for char, freq_by_pinyin in hanzi_dict.items():
        readings_by_char[char] = [
            pinyin for pinyin, _freq in freq_by_pinyin.items()
        ]
    return readings_by_char
def load_pinyin_table():
    """Load the pinyin table with each entry's readings ranked by frequency.

    The table is read from ``../../../../data/pinyin_table.txt`` (relative to
    the current working directory) through ``PYUtil.load_pinyin_table``.

    Returns:
        A dict mapping every key of the loaded table to a list of
        ``(pinyin, freq)`` pairs ordered by ``freq``, highest first.  Pairs
        with equal ``freq`` keep the order in which the table yields them.
    """
    # Close the table file deterministically instead of leaking the handle.
    # NOTE(review): assumes PYUtil.load_pinyin_table consumes the file
    # before returning -- confirm.
    with open("../../../../data/pinyin_table.txt") as table_file:
        hanzi_dict = PYUtil.load_pinyin_table(table_file)

    ranked_by_char = {}
    for char, freq_by_pinyin in hanzi_dict.items():
        # sorted() is stable, so this matches append-then-sort exactly.
        ranked_by_char[char] = sorted(
            freq_by_pinyin.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
    return ranked_by_char
def main():
    """Load the pinyin table and the Sogou phrases, then the QQ phrase list.

    The data directory defaults to the current directory and may be given as
    the single command-line argument.  Three inputs are read from it:
    ``pinyin_table.txt.bz2``, ``SogouLabDic-utf8.dic`` and
    ``qq_pinyin_1.0.txt.bz2``.  A progress line is printed to stdout before
    each load.
    """
    srcdir = "."
    if len(sys.argv) == 2:
        srcdir = sys.argv[1]

    print("Load pinyin_table.txt.bz2")
    filename = os.path.join(srcdir, "pinyin_table.txt.bz2")
    # try/finally rather than ``with``: BZ2File is a context manager only
    # from Python 2.7 on.
    # NOTE(review): assumes PYUtil.load_pinyin_table consumes the file
    # before returning -- confirm.
    bzf = bz2.BZ2File(filename, "r")
    try:
        hanzi_dic = PYUtil.load_pinyin_table(bzf)
    finally:
        bzf.close()

    print("Load SogouLabDic-utf8.dic")
    filename = os.path.join(srcdir, "SogouLabDic-utf8.dic")
    sogou_file = open(filename)
    try:
        sogou_phrase = PYUtil.load_sogou_phrases(sogou_file)

        print("Load qq_pinyin_1.0.txt.bz2")
        filename = os.path.join(srcdir, "qq_pinyin_1.0.txt.bz2")
        # The Sogou file stays open until this call returns, so the phrase
        # data remains readable even if load_sogou_phrases reads lazily.
        # NOTE(review): the result is not used any further in this function.
        qq_phrases = load_qq_phrases(filename, hanzi_dic, sogou_phrase)
    finally:
        sogou_file.close()
def main():
    """Recreate ``py.db`` and print pinyin candidates for Sogou phrases.

    The data directory defaults to the current directory and may be given as
    the single command-line argument.  The function:

    1. removes any existing ``py.db`` and creates a fresh one with its
       tables, pinyin table and shengmu table initialised;
    2. loads ``phrase_pinyin.txt.bz2`` and ``pinyin_table.txt.bz2``;
    3. for every line of ``SogouLabDic-utf8.dic`` whose phrase is not in the
       loaded phrase table, prints one ``phrase<TAB>pinyin'...<TAB>freq``
       line (UTF-8) for each combination of the per-character readings
       found in the pinyin table.  A phrase containing a character that is
       missing from the pinyin table produces no output.

    Progress messages and result lines both go to stdout.

    Returns:
        0, always.
    """
    srcdir = "."
    if len(sys.argv) == 2:
        srcdir = sys.argv[1]

    filename = "py.db"
    try:
        os.unlink(filename)
    except OSError:
        # Best effort: a missing (or unremovable) old database is ignored,
        # but unrelated exceptions such as KeyboardInterrupt now propagate.
        pass

    print("Create DB")
    db = PYSQLiteDB.PYSQLiteDB(filename)
    db.create_tables()
    db.init_pinyin_table()
    db.init_shengmu_table()

    # try/finally rather than ``with`` for the archives: BZ2File is a
    # context manager only from Python 2.7 on.
    # NOTE(review): assumes both PYUtil loaders consume the file before
    # returning -- confirm.
    print("Load phrase_pinyin.txt.bz2")
    filename = os.path.join(srcdir, "phrase_pinyin.txt.bz2")
    bzf = bz2.BZ2File(filename, "r")
    try:
        phrases_dic = PYUtil.load_phrase_pinyin(bzf)
    finally:
        bzf.close()

    print("Load pinyin_table.txt.bz2")
    filename = os.path.join(srcdir, "pinyin_table.txt.bz2")
    bzf = bz2.BZ2File(filename, "r")
    try:
        hanzi_dic = PYUtil.load_pinyin_table(bzf)
    finally:
        bzf.close()

    def print_phrase(phrase, phrase_orig, pinyins, freq):
        """Print one line per reading combination of the remaining ``phrase``.

        ``pinyins`` holds the readings already chosen for the characters
        consumed so far; ``phrase_orig`` and ``freq`` are passed through
        unchanged to the output line.
        """
        if not phrase:
            line = u"%s\t%s\t%d" % (phrase_orig, u"'".join(pinyins), freq)
            print(line.encode("utf-8"))
            return
        if phrase[0] not in hanzi_dic:
            # Unknown character: this phrase yields no candidate at all.
            return
        for pinyin, _freq in hanzi_dic[phrase[0]].items():
            print_phrase(phrase[1:], phrase_orig, pinyins + [pinyin], freq)

    # Same pattern as before (one or more tab characters), compiled once.
    tab_run = re.compile(u"\\t+")

    sogou_file = open(os.path.join(srcdir, "SogouLabDic-utf8.dic"))
    try:
        for l in sogou_file:
            w = tab_run.split(unicode(l, "utf8"))
            if w[0] in phrases_dic:
                # NOTE(review): phrases present in the phrase table are
                # skipped.  The code that used to follow an unconditional
                # ``continue`` here (printing each stored
                # (phrase, pinyin, freq) entry) was unreachable and has been
                # removed -- confirm that skipping is the intended behaviour.
                continue
            print_phrase(w[0], w[0], [], int(w[1]))
    finally:
        sogou_file.close()

    return 0