def load_blogs(self):
    '''
    Read microblog data and append it, tokenized, to ``blogs.txt`` in
    preparation for training word2vec.

    Rows of ``microblog`` are fetched page by page (newest ``blog_id``
    first). For each row the ``rmc`` column is used when it is non-empty,
    otherwise ``mc``. The text is segmented with ``jieba.cut``, tokens found
    in ``self.stopwords`` are dropped, and rows that still have at least five
    tokens are written as one space-separated line.

    The output file is opened in append mode, so repeated runs add to it.
    '''
    self.load_stopwords()
    filepath = 'blogs.txt'
    dbhelper = DBHelper()
    page_size = 100000
    max_pages = 227  # hard upper bound on the number of pages fetched
    query = 'SELECT mc,rmc FROM microblog ORDER BY blog_id DESC LIMIT %s,%s'
    processed = 0
    with open(filepath, 'a') as writer:
        for page in range(max_pages):
            # LIMIT takes (offset, row_count): the second value must be the
            # page size. Passing the end index, as before, made consecutive
            # pages overlap and wrote the same blogs several times.
            blogs = dbhelper.select(query % (page * page_size, page_size))
            if not blogs:
                break
            for blog in blogs:
                # Progress output: running index of the row being handled.
                print(processed)
                processed += 1
                # Prefer rmc (column 1), fall back to mc (column 0), else ''.
                msg = blog[1] or blog[0] or ''
                tokens = [token for token in jieba.cut(msg)
                          if token not in self.stopwords]
                if len(tokens) >= 5:
                    writer.write(' '.join(tokens) + '\n')
def expand_entry():
    '''
    Expand the background material of entries.

    Selects the id, name and category of every ``entry`` row whose
    ``background`` is NULL, prints the name and category, and hands the
    three values to ``Searcher.expand``.
    '''
    dbhelper = DBHelper()
    searcher = Searcher()
    pending = dbhelper.select("SELECT entryid,name,category FROM entry WHERE background IS NULL")
    for row in pending:
        entry_id, name, category = row[0], row[1], row[2]
        print('%s %s' % (name, category))
        searcher.expand(entry_id, name, category)
def recommend(filepath):
    '''
    Make recommendations for all users, evaluate them and return them.

    Every uid from ``DBHelper.get_uids()`` is passed to
    ``LogitRegRecommender.recommend``; the collected predictions are then
    given to ``Evaluator.evaluate`` and the resulting precision, recall and
    F1 are printed.
    -------------------------------
    filepath: file path for saving the recommendation results; it is
              forwarded unchanged to ``Evaluator.evaluate``
    -------------------------------
    return: {uid:[entryid,...],...} -- the per-user results exactly as
            returned by the recommender
    '''
    dbhelper = DBHelper()
    uids = dbhelper.get_uids()
    recommender = LogitRegRecommender()
    predictions = {}
    for uid in uids:
        # Wrap uid in a tuple so a tuple-valued uid is formatted as a whole.
        print('predict %s' % (uid,))
        predictions[uid] = recommender.recommend(uid)
    evaluator = Evaluator()
    precision, recall, F1 = evaluator.evaluate(predictions, filepath)
    print('%s %s %s' % (precision, recall, F1))
    # The docstring always promised this mapping, but it was never returned.
    return predictions