コード例 #1
0
ファイル: tfidf.py プロジェクト: pkuyouth/PKUyouthWebServer
	def update(self):
		"""Re-run the fragment, IDF and bin steps, reloading pickled results.

		The statement order below is deliberate: each ``get_*`` call is made
		before the matching ``pkl_load``.
		NOTE(review): presumably each ``get_*`` method writes the pickle file
		that is loaded right after it -- confirm against their definitions.
		"""
		# Force regeneration (cache bypassed), then load the result into memory.
		self.get_fragments(fromCache=False)
		self.fragments = pkl_load(self.Fragments_File)

		# Load both the IDF dictionary and the mono-word data after get_idfDict().
		self.get_idfDict()
		self.idfDict = pkl_load(self.IDF_Dict_File)
		self.monoWords = pkl_load(self.Mono_Words_File)

		# Unlike the steps above, nothing is reloaded here after get_bins().
		self.get_bins()
コード例 #2
0
def work2():
    """Filter ``userData.pkl`` down to ``user_source == 0`` and rewrite it.

    The filtered list is written back to the same pickle file, so the
    on-disk data is overwritten.
    NOTE(review): ``date`` and ``userNum`` are not used in the visible lines;
    this excerpt may be truncated -- confirm against the full function.
    """
    userData = pkl_load("userData.pkl")
    # Keep only records whose "user_source" field equals 0.
    userData = [data for data in userData if data["user_source"] == 0]
    pkl_dump("userData.pkl", userData)

    # Parallel per-record series: reference date and cumulative user count.
    date = [data["ref_date"] for data in userData]
    userNum = [data["cumulate_user"] for data in userData]
コード例 #3
0
    def __init__(self, account):
        """Set up logger, credential and cached access-token state for ``account``.

        ``account`` must be one of 'rabbitw', 'test' or 'pkuyouth'; this is
        checked with an ``assert``. The cached token and its expiry value are
        read from ``cachedir`` with defaults of ``''`` and ``0``.
        """
        assert account in ('rabbitw', 'test', 'pkuyouth')
        self.account = account
        self.logger = Logger("%s.auth" % account)

        # Per-account cache file names, computed once and reused below.
        token_file = "%s_accesstoken.pkl" % account
        expired_file = "%s_accesstoken_expired.pkl" % account
        self.Access_Token_File = token_file
        self.Access_Token_Expired_File = expired_file

        # App credentials come from the per-account secret files.
        self.__appId = get_secret("%s_appID.pkl" % account)
        self.__appSecret = get_secret("%s_appSecret.pkl" % account)

        # Cached token state, loaded with logging switched off.
        self.__access_token = pkl_load(cachedir, token_file, default='', log=False)
        self.__expired = pkl_load(cachedir, expired_file, default=0, log=False)
コード例 #4
0
def get_tops(newsID, top=10):
	"""Return the ``top`` articles most similar to ``newsID``.

	Similarity is ``dot / (sum(a) + sum(b) - dot)`` between the 0/1 word
	vector built here for ``newsID`` and each vector stored in
	``binarize.pkl``.
	NOTE(review): assumes the stored vectors are 1-D 0/1 arrays aligned with
	``wordsList.pkl`` -- confirm against the code that writes them.

	Args:
		newsID: key of the article in ``wordFrags.pkl``.
		top: maximum number of results.

	Returns:
		A list of ``(newsID, coefficient)`` tuples, highest coefficient first.
		Pairs with a coefficient of exactly 0 or 1 are left out, as are pairs
		whose denominator is zero.

	Raises:
		KeyError: if ``newsID`` is not present in ``wordFrags.pkl``.
	"""
	wordsList = pkl_load("wordsList.pkl")
	binarize = pkl_load("binarize.pkl")
	wordFrags = pkl_load("wordFrags.pkl")

	# One 0/1 entry per vocabulary word, in vocabulary order.
	ownWords = frozenset(wordFrags[newsID])
	thisBin = np.array([1 if word in ownWords else 0 for word in wordsList])
	thisSum = np.sum(thisBin)  # loop-invariant, so computed once

	tcs = {}
	for otherID, otherBin in binarize.items():
		shared = np.sum(np.dot(thisBin, otherBin))
		union = thisSum + np.sum(otherBin) - shared
		if union == 0:
			# Both vectors are all zeros: 0/0 would give NaN, which passes the
			# filter below and makes the final sort order unreliable.
			continue
		Tc = shared / union
		if Tc not in {0, 1}:  # drop reposts (identical) and fully unrelated articles
			tcs[otherID] = Tc

	return sorted(tcs.items(), key=lambda item: item[1], reverse=True)[:top]
コード例 #5
0
	def update_table_newsInfo(self, method="update", fromCache=False):
		"""Build the table of mass-sent article (news) information.

		Args:
			method: "rebuild" or "update". Both write every parsed row through
				``insert_many``; they differ only in the success log message.
			fromCache: if true, load the raw message list from
				``self.Cache_NewsInfo``; otherwise fetch it with ``WxSpider``
				and store it in that cache file.

		Raises:
			ValueError: if ``method`` is neither "rebuild" nor "update". This
				is checked first, so a bad value costs no fetch or cache write.
		"""
		if method == "rebuild":
			successMessage = "Table newsInfo Create Success !"
		elif method == "update":
			successMessage = "Table newsInfo Update Success !"
		else:
			raise ValueError("unexpected method '%s' !" % method)

		if fromCache:  # load from the local cache
			totalNewsInfo = pkl_load(self.Cache_NewsInfo)
		else:
			logger.info("Getting newsInfo...")
			totalNewsInfo = WxSpider().batchget_newsInfo()
			pkl_dump(self.Cache_NewsInfo, totalNewsInfo)

		fields = {"newsID","appmsgid","idx","sn","title","cover","content_url","like_num","read_num","masssend_time"}
		# Query-string keys of content_url mapped to the column they fill.
		queryAliases = {"idx": "idx", "itemidx": "idx", "sn": "sn", "sign": "sn"}
		liveMarkers = {"comment_id", "copyright_type"}
		newsDicts = []

		for msgInfo in totalNewsInfo:
			if msgInfo["type"] != 9:
				continue  # type 9 is an article message; skip everything else

			masssend_time = msgInfo["sent_info"]["time"]

			for newsInfo in msgInfo["appmsg_info"]:
				if newsInfo["is_deleted"] or liveMarkers.isdisjoint(newsInfo):
					continue  # the article has been deleted; skip it

				news = {k: v for k, v in newsInfo.items() if k in fields}
				query = parse_qs(urlparse(newsInfo["content_url"]).query)
				for key, values in query.items():
					column = queryAliases.get(key)
					if column is not None:
						news[column] = values[0]

				# newsID is the zero-padded 10-digit appmsgid followed by idx.
				news["newsID"] = "{appmsgid:0>10d}{idx}".format(**news)
				news["masssend_time"] = datetime.fromtimestamp(masssend_time)
				newsDicts.append(news)

		# Read counts in newsInfo are refreshed daily, so "update" overwrites
		# every row just like "rebuild" instead of inserting only new ones.
		self.insert_many("newsInfo", newsDicts)
		logger.info(successMessage)
コード例 #6
0
	def _cut_words(self, fromCache=True):
		"""Return a mapping of news ID to its filtered word fragments.

		Args:
			fromCache: if true, return the mapping stored in ``wordFrags.pkl``.
				Otherwise read the news from ``DataBase``, segment each item
				with jieba, and write the new mapping to ``wordFrags.pkl``.

		Returns:
			dict mapping each news ID to a list of fragments, excluding stop
			words, whitespace-only fragments and all-digit fragments.
		"""
		if fromCache:
			return pkl_load("wordFrags.pkl")

		with DataBase() as db:
			newsID, newsData = db.get_news()

		wordFragsList = []
		jieba.enable_parallel(4)
		try:
			for news in show_status(newsData, "cut words"):
				words = [
					frag for frag in jieba.cut(news, cut_all=False)
					if frag not in self.stopWords
					and not frag.isspace()
					and not frag.isdigit()
				]
				wordFragsList.append(words)
		finally:
			# Switch parallel mode back off even if segmentation fails.
			jieba.disable_parallel()

		wordFrags = dict(zip(newsID, wordFragsList))
		# The original call omitted the object to dump.
		pkl_dump("wordFrags.pkl", wordFrags)
		return wordFrags
コード例 #7
0
def plt_show():
	"""Plot all pairwise similarity coefficients in descending order.

	For each pair of vectors in ``binarize.pkl`` (each unordered pair once,
	self-pairs included) the value ``dot / (sum(a) + sum(b) - dot)`` is
	collected; the sorted values are then drawn with matplotlib.
	"""
	bins = pkl_load("binarize.pkl")

	coefficients = []
	for idA, vecA in show_status(bins.items()):
		for idB, vecB in bins.items():
			# Visit each unordered pair only once to halve the work.
			if idA <= idB:
				overlap = np.dot(vecA, vecB)
				coefficients.append(overlap / (np.sum(vecA) + np.sum(vecB) - overlap))

	ordered = sorted(coefficients, reverse=True)

	plt.plot(np.array(ordered))
	plt.show()
コード例 #8
0
	def update_table_newsContent(self, method="update", fromCache=False):
		"""Fill the newsContent table with article content.

		Args:
			method: "rebuild" inserts content for every row of newsInfo;
				"update" fetches and inserts only newsIDs that are in newsInfo
				but not yet in newsContent.
			fromCache: used by "rebuild" only. If true, content is loaded from
				``self.Cache_NewsContent``; otherwise it is fetched with
				``WxSpider`` and stored in that cache file.

		Raises:
			ValueError: if ``method`` is neither "rebuild" nor "update".
		"""
		columns = ("newsID", "title", "content_url")

		if method == "rebuild":
			if fromCache:
				newsContents = pkl_load(self.Cache_NewsContent)
			else:
				newsContents = WxSpider().batchget_newsContent(self.select("newsInfo", columns).fetchall())
				pkl_dump(self.Cache_NewsContent, newsContents)
			successMessage = "Table newsContent Create Success !"
		elif method == "update":
			# NOTE(review): presumably single_cur yields bare newsID values, not
			# row tuples; otherwise the set difference never matches -- confirm.
			oldNewsIDs = set(self.single_cur.execute("SELECT newsID FROM newsContent").fetchall())
			nowNewsIDs = set(self.get_newsIDs())

			new = nowNewsIDs - oldNewsIDs  # newly published articles
			newsInfos = self.select("newsInfo", columns).fetchall()
			newsContents = WxSpider().batchget_newsContent([news for news in newsInfos if news["newsID"] in new])
			successMessage = "Table newsContent Update Success !"
		else:
			raise ValueError("unexpected method '%s' !" % method)

		self.insert_many("newsContent", newsContents)
		logger.info(successMessage)
コード例 #9
0
ファイル: tfidf.py プロジェクト: pkuyouth/PKUyouthWebServer
	def init_for_match(self):
		"""Load ``self.Bins_File`` into ``self.bins`` and return ``self``.

		The load is done with logging switched off. Returning ``self`` allows
		the call to be chained.
		"""
		loadedBins = pkl_load(self.Bins_File, log=False)
		self.bins = loadedBins
		return self
コード例 #10
0
ファイル: tfidf.py プロジェクト: pkuyouth/PKUyouthWebServer
	def init_for_update(self):
		"""Load ``self.Stop_Words_File`` into ``self.stopWords`` and return ``self``.

		Returning ``self`` allows the call to be chained.
		"""
		loadedStopWords = pkl_load(self.Stop_Words_File)
		self.stopWords = loadedStopWords
		return self
コード例 #11
0
 def get(cls):
     """Return the object stored in ``cls.Access_Token_File`` (loaded without logging)."""
     tokenFile = cls.Access_Token_File
     return pkl_load(tokenFile, log=False)
コード例 #12
0
	def __init__(self):
		"""Load the mini-program app ID and app secret from ``secretdir``.

		Both values are read with logging switched off and stored in private
		(name-mangled) attributes.
		"""
		def loadSecret(filename):
			# Single place for the shared directory and log=False arguments.
			return pkl_load(secretdir, filename, log=False)

		self.__appID = loadSecret("miniprogram_appID.pkl")
		self.__appSecret = loadSecret("miniprogram_appSecret.pkl")