예제 #1
0
	def __init__ (self, config):
		PYSQLiteDB.__init__ (self, user_db = "user_zhengju.db")
		#~ print "db init"
		#~ Init caches
		self.select_cache = Cache ()
		self.select_all_cache = Cache()
		self.longest_length_cache = Cache()
		self.load_config(config)
예제 #2
0
class ZhengJuDB (PYSQLiteDB):
	"""phrase database that contains all phrases and phrases' pinyin"""
	def __init__ (self, config):
		PYSQLiteDB.__init__ (self, user_db = "user_zhengju.db")
		#~ print "db init"
		#~ Init caches
		self.select_cache = Cache ()
		self.select_all_cache = Cache()
		self.longest_length_cache = Cache()
		self.load_config(config)

	def load_config(self, config):
		self.large_charset = config.read ("/IMEngine/Python/ZhengJu/LargeCharset", False)
		
	def clear_cache(self):
		#~ pass
		self.select_cache.clear()
		self.select_all_cache.clear()
		self.longest_length_cache.clear()

	def add_phrase (self, pinyin, freq, database = "user_db", user_freq = 32):
		""" add phrase to database"""
		sqlstring = """INSERT INTO %s.py_phrase (ylen, y0, y1, y2, y3, yx, s0, s1, s2, s3, phrase, freq, user_freq)
			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
		phrase = u"".join([i.char for i in pinyin])
		#~ print "commit", phrase
		record = [None, None, None, None, None, None, None, None, None, None, None, 0, None]
		record [YLEN] = len (pinyin)
		i = 0
		for p in pinyin[:4]:
			record[Y0 + i] = p.get_pinyin_id ()
			record[S0 + i] = p.get_sheng_mu_id ()
			i += 1
		if len(pinyin) > 4:
			record[YX] = "'".join (map (str, pinyin[4:]))
		record[PHRASE] = phrase
		record[FREQ]= freq
		record[USER_FREQ] = user_freq
		self.db.execute (sqlstring % (database, ), record)
		self.db.commit ()
		del self.select_cache[u"'".join (map (str, pinyin))]

	def clean_useless_phrase (self):
		"Remove phrase from user database"
		#~ print "remove", record[6]
		sql = "DELETE FROM user_db.py_phrase WHERE user_freq = 0"
		self.db.execute (sql)
		self.db.commit ()
		self.clear_cache()

	def remove_phrase (self, record, database = "user_db"):
		"Remove phrase from user database"
		#~ print "remove", record[6]
		sql = "DELETE FROM user_db.py_phrase WHERE ylen = ? AND y0 = ? AND phrase = ?"
		self.db.execute (sql, (record[YLEN], record[Y0], record[6]))
		self.db.commit ()
		self.clear_cache()
	
	def build_pinyin_condition(self, pys, length = None):
		sql_conditions = []
		i = 0
		for py in pys[:4]:
			sql_conditions.append ("s%d = %d" % (i, py.get_sheng_mu_id ()))
			if py.is_complete ():
				sql_conditions.append ("y%d = %d" % (i, py.get_pinyin_id ()))
			i += 1
		if len (pys) > 4:
			pp = lambda (x): x.get_pinyin() if x.is_complete () else x.get_pinyin() + "%"
			pattern = "'".join (map (pp, pys[4:]))
			if length == None:
				sql_conditions.append ("yx LIKE \"" + pattern + "%\"")
			else:
				sql_conditions.append ("yx LIKE \"" + pattern + "\"")
		return sql_conditions

	def get_longest_phrase_length(self , pylist):
		"""return the longest word starting with first two pinyin of pylist"""
		if len(pylist)<5:
			return len(pylist)
		pys=pylist[:4]
		pinyin_string = u"'".join (map (str, pys))
		result = self.longest_length_cache [pinyin_string]
		if result != None:
			return result
		candidates = self.select_words_by_pinyin_list_all(pys)
		if candidates:
			result = max([i[YLEN] for i in candidates])
		else:
			result = 3
		#~ tables_union = """( SELECT ylen FROM main.py_phrase WHERE %(conditions)s UNION ALL
		#~ SELECT ylen FROM user_db.py_phrase WHERE %(conditions)s )"""
		#~ sql_conditions = self.build_pinyin_condition(pys)
		#~ tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
		#~ sql_string = "SELECT Max(ylen) FROM " + tables_union + " ;"
		#~ print "start", time.time()
		#~ result = list (self.db.execute (sql_string).fetchall ())
		#~ if not result[0][0]:
			#~ result =  1
		#~ else:
			#~ result = result[0][0]
		self.longest_length_cache[pinyin_string] = result
		#~ print "longest",pinyin_string, result, time.time()
		if result > len(pylist):
			return len(pylist)
		return result

	def select_words_by_phrase (self, pys, database = None):
		return self.select_words_by_pinyin(pys, phrase=u"".join([i.char for i in pys]),database=database)
	
	def select_phrase (self, phrase, database = None):
		sql_conditions = "ylen =%d AND phrase = \"%s\"" % (len(phrase), phrase)
		if database:
			tables_union = "(SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM " + database+ ".py_phrase WHERE %(conditions)s) "
		else:
			tables_union = """ (SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM user_db.py_phrase WHERE %(conditions)s UNION ALL
							SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM main.py_phrase WHERE %(conditions)s) """		
		sql_prefix = "SELECT * FROM " + tables_union % { "conditions" : sql_conditions }
		sql_string = sql_prefix + " GROUP BY phrase ORDER BY freq;"
		result = list (self.db.execute (sql_string).fetchall ())
		return result

	def select_words_by_pinyin (self, pys, length=None, phrase=None, database = None):
		sql_conditions = []
		if length:
			sql_conditions = ["ylen = %d" % length ]
		if not self.large_charset:
			sql_conditions.append("freq > 0")
		if phrase:
			sql_conditions.append("phrase = \"%s" % phrase +"\"")
		if database:
			tables_union = "( SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM " + database+ ".py_phrase WHERE %(conditions)s )"
		else:
			tables_union = """( SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM user_db.py_phrase WHERE %(conditions)s UNION ALL
							SELECT ylen, y0, y1, y2, y3, yx, phrase, freq, user_freq FROM main.py_phrase WHERE %(conditions)s )"""		
		sql_conditions.extend (self.build_pinyin_condition (pys, length))
		tables_union = tables_union % { "conditions" : " AND ".join (sql_conditions) }
		sql_prefix = "SELECT ylen, y0, y1, y2, y3, yx, phrase, MAX(freq*ifnull(user_freq,1)) as adj_freq FROM " + tables_union
		sql_string = sql_prefix + " GROUP BY phrase ORDER BY adj_freq DESC;"
		#~ print "by pinyin"
		result = list (self.db.execute (sql_string).fetchall ())
		#~ print result
		return result

	def select_words_by_pinyin_list_all (self, pys, length = None):
		"""select words from database by list that contains PYDict.PinYinWord objects"""
		pinyin_string = u"'".join (map (str, pys))
		result = self.select_all_cache [pinyin_string]
		if result == None:
			result = self.select_words_by_pinyin(pys)
			self.select_all_cache[pinyin_string] = result
			#~ print "list_all",pinyin_string
		return result
	def remove_all_user_words(self):
		sql = "DELETE FROM user_db.py_phrase WHERE freq = " + str(USER_WORD)
		self.db.execute (sql)
		self.db.commit ()
	def remove_all_user_phrase(self):
		sql = "DELETE FROM user_db.py_phrase WHERE freq = " + str(USER_PHRASE)
		self.db.execute (sql)
		self.db.commit ()
	def remvoe_all_user_freq(self):
		sql = "DELETE FROM user_db.py_phrase WHERE freq != %d and freq != %d" (USER_PHRASE, USER_WORD)
		self.db.execute (sql)
		self.db.commit ()
	def select_words_by_pinyin_list (self, pys, database = None):
		"""select words from database by list that contains PYDict.PinYinWord objects"""
		pinyin_string = u"'".join (map (str, pys))
		result = self.select_cache [pinyin_string]
		if result == None:
			result = self.select_words_by_pinyin(pys, len(pys), database)
			self.select_cache[pinyin_string] = result
			#~ print "list",pinyin_string
		return result
	
	def adjust_phrase_freq (self, pylist):
		"""this function adjusts frequence of phrase in user database."""
		p = self.select_words_by_phrase(pylist,"user_db")
		if len(p)>0:
			q = self.select_words_by_pinyin(pylist, length = len(pylist), database = "main")
			if len(q) == 0 or p[0][7] > q[0][7]:
				sql_conditions = ["ylen = %d" % len(pylist) ]
				sql_conditions.extend (self.build_pinyin_condition (pylist, len(pylist)))
				sql_conditions.append ( "phrase != ?" )
				sql_conditions.append ( "user_freq * freq > " + str(p[0][7]) )
				sql = "UPDATE user_db.py_phrase SET user_freq = user_freq / 2 WHERE %(conditions)s" % { "conditions" : " AND ".join (sql_conditions) }
				self.db.execute (sql, [u"".join([i.char for i in pylist])])
				self.db.commit ()
			sql = "UPDATE user_db.py_phrase SET user_freq = user_freq * 2 WHERE phrase = ?"
			self.db.execute (sql,[u"".join([i.char for i in pylist])])
			self.db.commit ()
		else:
			p = self.select_words_by_phrase(pylist,"main")
			self.add_phrase(pylist,p[0][7],user_freq = 10)