@function_timer
def t2wv(db):
    """Map every article title in *db* to ``model``'s output for its text.

    *db* is iterated as ``(text, attrib)`` pairs.  Each ``text`` is segmented
    with ``jieba.cut`` and the result is used to index the module-level
    ``model``; the value obtained is stored under ``attrib["title"]``.

    NOTE: ``model`` is a module global that is only bound inside the
    ``__main__`` block below, so this function must not be called before
    that assignment has run.

    Returns:
        dict: title -> ``model[...]`` result.  If two entries share a title,
        the later one overwrites the earlier one.
    """
    return {attrib["title"]: model[jieba.cut(text)] for text, attrib in db}


if __name__ == "__main__":
    db = Database(
        "data/zhwiki-extracted/",
        conditions=[
            Database.cond_length(50),
            Database.cond_title(lambda t: not t.startswith("Wikipedia:")),
            Database.cond_title(lambda t: not t.startswith("File:")),
            Database.cond_title(lambda t: not t.startswith("Draft:")),
        ],
    )
    # 767125 loaded, 451657 filtered, 31997 fails

    # `model` is read as a global by t2wv(), so bind it before the call.
    model = Method.load("dump/lda-model.dump")
    title2topic = t2wv(db)

    # Pickle streams are binary data: write in "wb" and let the context
    # manager close the handle even if dumping raises.
    with open("dump/title2topic.dump", "wb") as dump_file:
        pickle.dump(title2topic, dump_file)
    #title2wv = pickle.load(open("dump/title2wv.dump"))

    # Parallel structures: vectors[i] is the value stored for id2title[i].
    vectors = []
    id2title = {}
    for i, (title, vector) in enumerate(title2topic.items()):
        vectors.append(vector)
        id2title[i] = title

    # Read the sample file as UTF-8; `with` guarantees it gets closed.
    with codecs.open("样例数据.txt", "r", "utf-8") as fin:
        lines = [raw_line.strip() for raw_line in fin.readlines()]

    # Keep only the text before the first "?".  partition() leaves a line
    # without any "?" intact, whereas slicing with find() (which returns -1
    # in that case) would silently drop the line's last character.
    queries = [line.partition(u"?")[0] for line in lines]

    import sys