예제 #1
0
def main ():
	"""Turn "phrase with inline pinyin" lines from stdin into phrase/pinyin rows.

	Every UTF-8 input line is scanned character by character.  A character
	outside a-z is collected as a phrase character, and the run of a-z
	letters that directly follows it is used as that character's pinyin.
	Letters that precede the first phrase character are discarded.

	A phrase character without inline pinyin is looked up in the table
	loaded from "../pinyin_table.txt"; the lookup only succeeds when the
	table holds exactly one pinyin for that character.

	For each line that can be resolved, the phrase, a TAB, and the pinyins
	joined with "'" are printed to stdout as UTF-8.  A line that cannot be
	resolved is reported on stderr with its 0-based input line number and
	skipped.
	"""
	# Close the table file as soon as it has been parsed instead of leaking it.
	with open ("../pinyin_table.txt") as table_file:
		hanzi_dict = PYUtil.load_pinyin_table (table_file)

	lowercase = u"abcdefghijklmnopqrstuvwxyz"

	# enumerate keeps the reported number in step with the input even when
	# earlier lines were skipped because of an error.
	for line_no, line in enumerate (sys.stdin):
		line = unicode (line, "utf8").strip ()
		strings = []
		pinyins = []
		pinyin = None
		for c in line:
			if c in lowercase:
				# extend the pinyin of the most recent phrase character
				pinyin = c if pinyin is None else pinyin + c
			else:
				# a new phrase character closes the previous one's pinyin
				pinyins.append (pinyin)
				pinyin = None
				strings.append (c)
		pinyins.append (pinyin)
		# The first entry belongs to whatever came before the first phrase
		# character; dropping it aligns pinyins[i] with strings[i].
		pinyins = pinyins[1:]

		try:
			for i, c in enumerate (strings):
				if pinyins[i] is None:
					readings = hanzi_dict[c]
					# only an unambiguous character can be filled in
					if len (readings) != 1:
						raise ValueError ("character needs exactly one pinyin")
					pinyins[i] = list (readings)[0]
		except (KeyError, ValueError):
			sys.stderr.write ("%d : error!\n" % line_no)
			continue

		output = u"%s\t%s" % (u"".join (strings), u"'".join (pinyins))
		print (output.encode ("utf8"))
예제 #2
0
def load_pinyin_table ():
	"""Load the pinyin table and keep only the pinyins of each entry.

	The table is parsed by PYUtil.load_pinyin_table from
	"../../../../data/pinyin_table.txt".

	Returns:
		A dict with the same keys as the parsed table.  Each value is a
		list holding the pinyin keys of that entry; the frequency stored
		with each pinyin is dropped.
	"""
	# Close the data file once it has been parsed instead of leaking it.
	with open ("../../../../data/pinyin_table.txt") as table_file:
		hanzi_dict = PYUtil.load_pinyin_table (table_file)

	tmp = {}
	for key, value in hanzi_dict.items ():
		tmp[key] = [pinyin for pinyin, _freq in value.items ()]

	return tmp
예제 #3
0
def load_pinyin_table ():
	"""Load the pinyin table with each entry's pinyins ordered by frequency.

	The table is parsed by PYUtil.load_pinyin_table from
	"../../../../data/pinyin_table.txt".

	Returns:
		A dict with the same keys as the parsed table.  Each value is a
		list of (pinyin, freq) tuples sorted by freq, highest first.
	"""
	# Close the data file once it has been parsed instead of leaking it.
	with open ("../../../../data/pinyin_table.txt") as table_file:
		hanzi_dict = PYUtil.load_pinyin_table (table_file)

	tmp = {}
	for key, value in hanzi_dict.items ():
		# sorted is stable, so pinyins with equal freq keep their table order
		tmp[key] = sorted (value.items (), key = lambda v: v[1], reverse = True)

	return tmp
예제 #4
0
def main ():
	"""Load the pinyin table and the Sogou phrases, then load the QQ phrases.

	The data files pinyin_table.txt.bz2, SogouLabDic-utf8.dic and
	qq_pinyin_1.0.txt.bz2 are looked up in the directory passed as the only
	command-line argument.  When the script is not called with exactly one
	argument, the current directory is used.  A progress line is printed
	before each file is loaded.
	"""
	srcdir = "."
	if len (sys.argv) == 2:
		srcdir = sys.argv[1]

	print ("Load pinyin_table.txt.bz2")
	filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
	bzf = bz2.BZ2File (filename, "r")
	# try/finally instead of "with": BZ2File is only a context manager
	# from Python 2.7 on.
	try:
		hanzi_dic = PYUtil.load_pinyin_table (bzf)
	finally:
		bzf.close ()

	print ("Load SogouLabDic-utf8.dic")
	filename = os.path.join (srcdir, "SogouLabDic-utf8.dic")
	# NOTE(review): the dictionary file stays open until load_qq_phrases has
	# returned, in case load_sogou_phrases reads it lazily -- that helper is
	# not visible from here.
	with open (filename) as dic_file:
		sogou_phrase = PYUtil.load_sogou_phrases (dic_file)

		print ("Load qq_pinyin_1.0.txt.bz2")
		filename = os.path.join (srcdir, "qq_pinyin_1.0.txt.bz2")
		# the result is not used any further inside this function
		qq_phrases = load_qq_phrases (filename, hanzi_dic, sogou_phrase)
예제 #5
0
def main ():
	"""Recreate py.db and print pinyin candidates for unlisted Sogou phrases.

	The data files phrase_pinyin.txt.bz2, pinyin_table.txt.bz2 and
	SogouLabDic-utf8.dic are looked up in the directory passed as the only
	command-line argument.  When the script is not called with exactly one
	argument, the current directory is used.

	Steps:
	  1. Remove any existing "py.db", then create it again and initialise
	     its tables through PYSQLiteDB.
	  2. Load the phrase/pinyin data and the per-character pinyin table.
	  3. For every phrase in SogouLabDic-utf8.dic that is not a key of the
	     phrase/pinyin data, print one UTF-8 line per combination of the
	     pinyins of its characters: phrase, TAB, pinyins joined with "'",
	     TAB, frequency.  A phrase that contains a character missing from
	     the pinyin table produces no output.

	Returns:
		0
	"""
	srcdir = "."
	if len (sys.argv) == 2:
		srcdir = sys.argv[1]

	filename = "py.db"
	try:
		os.unlink (filename)
	except OSError:
		# Best effort: usually there simply is no old database to remove.
		pass

	print ("Create DB")
	db = PYSQLiteDB.PYSQLiteDB (filename)
	db.create_tables ()
	db.init_pinyin_table ()
	db.init_shengmu_table ()

	# try/finally instead of "with" for the two archives below: BZ2File is
	# only a context manager from Python 2.7 on.
	print ("Load phrase_pinyin.txt.bz2")
	filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
	bzf = bz2.BZ2File (filename, "r")
	try:
		phrases_dic = PYUtil.load_phrase_pinyin (bzf)
	finally:
		bzf.close ()

	print ("Load pinyin_table.txt.bz2")
	filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
	bzf = bz2.BZ2File (filename, "r")
	try:
		hanzi_dic = PYUtil.load_pinyin_table (bzf)
	finally:
		bzf.close ()

	def print_phrase (phrase, phrase_orig, pinyins, freq):
		# Walk the phrase one character at a time, branching on every pinyin
		# of the current character; "pinyins" holds the choices made so far.
		if not phrase:
			# every character has a pinyin assigned: emit this combination
			line = u"%s\t%s\t%d" % (phrase_orig, u"'".join (pinyins), freq)
			print (line.encode ("utf-8"))
			return
		if phrase[0] not in hanzi_dic:
			# unknown character: abandon this phrase without output
			return
		for pinyin, _f in hanzi_dic[phrase[0]].items ():
			print_phrase (phrase[1:], phrase_orig, pinyins + [pinyin], freq)

	with open (os.path.join (srcdir, "SogouLabDic-utf8.dic")) as dic_file:
		for l in dic_file:
			# columns are separated by one or more tabs: phrase, frequency, ...
			w = re.split (u"\\t+", unicode (l, "utf8"))
			# Phrases already present in phrase_pinyin are skipped.  The code
			# that used to print them sat behind an unconditional "continue"
			# and never ran, so it has been removed.
			if w[0] in phrases_dic:
				continue
			print_phrase (w[0], w[0], [], int (w[1]))

	return 0