Ejemplo n.º 1
0
Archivo: test.py Proyecto: cjx3721/QA
@function_timer
def t2wv(db) :
	"""Build a dict mapping each entry's title to its model lookup result.

	`db` is iterated as (text, attrib) pairs; `attrib["title"]` is used as
	the key. The value is `model[jieba.cut(text)]`, where `model` is a
	module-level global that must be bound before this function is called
	(the `__main__` block assigns it). If two entries share a title, the
	later one overwrites the earlier one.
	"""
	title_to_value = {}
	for text, attrib in db :
		# Resolve the title first, then query the model, mirroring the
		# left-to-right evaluation order of a (key, value) tuple.
		title = attrib["title"]
		title_to_value[title] = model[jieba.cut(text)]
	return title_to_value
	
if __name__ == "__main__" :
	# Script entry point: build the filtered database, map every title to
	# its model output, persist that mapping, then read the sample queries.
	db = Database("data/zhwiki-extracted/", conditions = [
		# NOTE(review): presumably a length threshold of 50 -- confirm
		# the exact meaning against Database.cond_length.
		Database.cond_length(50), 
		# Each predicate is True for titles that do NOT start with the given
		# namespace prefix; presumably entries failing a condition are dropped.
		Database.cond_title(lambda t: not t.startswith("Wikipedia:")),
		Database.cond_title(lambda t: not t.startswith("File:")),
		Database.cond_title(lambda t: not t.startswith("Draft:"))
	])
	# 767125 loaded, 451657 filtered, 31997 fails

	
	# `model` is bound at module level on purpose: t2wv() reads it as a global.
	model = Method.load("dump/lda-model.dump")
	title2topic = t2wv(db)
	# NOTE(review): the file is opened in text mode ("w") and never explicitly
	# closed. pickle.dump requires a binary-mode file under Python 3; confirm
	# the target interpreter before relying on this line.
	pickle.dump(title2topic, open("dump/title2topic.dump", "w"))
	#title2wv = pickle.load(open("dump/title2wv.dump"))
	# Build two parallel structures: vectors[i] is the value stored for the
	# title id2title[i].
	vectors = []
	id2title = {}
	for i, (t, v) in enumerate(title2topic.items()) :
		vectors.append(v)
		id2title[i] = t
	
	# Read the sample-data file as UTF-8 and strip surrounding whitespace
	# from every line.
	fin = codecs.open("样例数据.txt", "r", "utf-8")
	lines = map(lambda l: l.strip(), fin.readlines())
	fin.close()
	# Keep the part of each line before the first ASCII "?".
	# NOTE(review): find() returns -1 when no "?" is present, so such a line
	# silently loses its last character; a full-width question mark is not
	# matched by this literal either.
	queries = map(lambda l: l[:l.find(u"?")], lines)
	
	# Imported mid-block; nothing in the visible lines uses it, so it is
	# presumably needed by code that follows.
	import sys