コード例 #1
0
	def load_blogs(self):
		'''
		Read microblog data in preparation for training word2vec.

		Calls self.load_stopwords(), then pages through the ``microblog``
		table (newest blog_id first) in chunks of 100000 rows, for at most
		227 pages. Each message is segmented with jieba, tokens found in
		self.stopwords are dropped, and every message that still has at
		least 5 tokens is appended to ``blogs.txt`` as one line of
		space-separated tokens.

		A running row counter is printed for every row as progress output.
		'''
		self.load_stopwords()
		filepath = 'blogs.txt'
		dbhelper = DBHelper()
		page_size = 100000
		max_pages = 227
		query = 'SELECT mc,rmc FROM microblog ORDER BY blog_id DESC LIMIT %s,%s'
		# Look the attribute up once instead of once per token.
		stopwords = self.stopwords
		seen = 0
		# NOTE(review): the file is opened in append mode, so re-running this
		# method adds to an existing blogs.txt rather than replacing it.
		with open(filepath, 'a') as writer:
			for page in range(max_pages):
				# LIMIT takes (offset, row_count). The row count must stay at
				# page_size; passing (page+1)*page_size made consecutive pages
				# overlap and wrote duplicate lines to the corpus.
				blogs = dbhelper.select(query % (page * page_size, page_size))
				if not blogs:
					# No more rows: stop before reaching the page cap.
					break
				for blog in blogs:
					# Single-argument form prints the same under Python 2 and 3.
					print(seen)
					seen += 1
					# Prefer column rmc (blog[1]); fall back to mc (blog[0]).
					# NOTE(review): presumably rmc is the reposted content and
					# mc the original message - confirm against the schema.
					msg = blog[1] or blog[0] or ''
					tokens = [token for token in jieba.cut(msg) if token not in stopwords]
					# Very short messages are not written to the corpus.
					if len(tokens) >= 5:
						writer.write(' '.join(tokens) + '\n')
コード例 #2
0
ファイル: main.py プロジェクト: rabintang/Recommendation
def expand_entry():
	'''
	扩展词条的背景材料
	'''
	dbhelper = DBHelper()
	searcher = Searcher()
	entrys = dbhelper.select("SELECT entryid,name,category FROM entry WHERE background IS NULL")
	for entry in entrys:
		print entry[1], entry[2]
		searcher.expand(entry[0], entry[1], entry[2])