Example #1
def work2():
    # reload the cached user data, keep only user_source == 0 records,
    # and write the filtered list back to the cache
    userData = pkl_load("userData.pkl")
    userData = [data for data in userData if data["user_source"] == 0]
    pkl_dump("userData.pkl", userData)

    date = [data["ref_date"] for data in userData]
    userNum = [data["cumulate_user"] for data in userData]
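The `pkl_load`/`pkl_dump` helpers these examples revolve around are never shown, and their signatures vary slightly (Example #9 passes a cache directory as a first argument). A minimal sketch of what they presumably wrap, assuming the two-argument form used here:

import pickle

def pkl_load(path):
    # read one pickled object back from disk
    with open(path, "rb") as f:
        return pickle.load(f)

def pkl_dump(path, obj, log=True):
    # serialize obj to disk; the log flag mirrors the one seen in Example #7
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    if log:
        print("dumped %s" % path)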
Example #2
	def get_idfDict(self):
		# each article contributes every distinct word once, so wordsSum holds document frequencies
		wordsSum = Counter(iter_flat([list(set(words)) for newsID, words in self.fragments.items()]))

		monoWords = {word for word, freq in wordsSum.items() if freq <= 1}  # words that appear in only one article
		pkl_dump(self.Mono_Words_File, monoWords)

		newsCount = len(self.fragments)
		idfDict = {word: math.log(newsCount/freq) for word, freq in wordsSum.items()}
		pkl_dump(self.IDF_Dict_File, idfDict)
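Since `wordsSum` holds document frequencies, `math.log(newsCount/freq)` is the classic IDF. A hedged sketch of how `idfDict` could then weight a single article (the `tfidf` helper below is illustrative, not from the original class):

from collections import Counter

def tfidf(words, idfDict):
    # term frequency of each word within one article, scaled by its IDF
    tf = Counter(words)
    total = len(words)
    return {word: (freq / total) * idfDict.get(word, 0.0)
            for word, freq in tf.items()}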
Example #3
	def get_fragments(self, fromCache=False):
		if fromCache:
			return
		else:
			with SQLiteDB() as newsDB:
				newsContents = newsDB.select("newsContent", ("newsID", "content")).fetchall()
				newsContents = [news for news in newsContents if news["newsID"] not in self.discard_newsIDs]
			fragments = {news["newsID"]: self.lcut(news["content"]) for news in show_status(newsContents, "cut news")}
			pkl_dump(self.Fragments_File, fragments)
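The `lcut` method is not shown here; since Example #6 tokenizes with jieba, a plausible standalone sketch (the stopword filtering mirrors that example and is an assumption):

import jieba

def lcut(content, stopWords=frozenset()):
    # cut with jieba, dropping stopwords, whitespace-only and purely numeric tokens
    return [w for w in jieba.lcut(content)
            if w not in stopWords and not w.isspace() and not w.isdigit()]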
Example #4
	def get_binarization(self):
		binarize = dict()
		wordsList = [line["word"] for line in read_csv("word_frequency.csv")]
		wordsSet = frozenset(wordsList)
		for newsID, words in show_status(self.wordFrags.items(), "get binarization"):
			# start from an all-zero vector over the vocabulary, then flag present words
			binarize[newsID] = {key: 0 for key in wordsList}
			for word in words:
				if word in wordsSet:
					binarize[newsID][word] = 1
		# fix the word order so every news item maps to a comparable numpy vector
		binarize = {newsID: np.array([freqs[word] for word in wordsList]) for newsID, freqs in binarize.items()}
		pkl_dump("wordsList.pkl", wordsList)
		pkl_dump("binarize.pkl", binarize)
Example #5
	def update_table_newsInfo(self, method="update", fromCache=False):
		"""构造群发图文信息表"""
		try:
			if not fromCache:
				logger.info("Getting newsInfo...")
				totalNewsInfo = WxSpider().batchget_newsInfo()
				pkl_dump(self.Cache_NewsInfo, totalNewsInfo)
			else:  # load from the local cache
				totalNewsInfo = pkl_load(self.Cache_NewsInfo)

			fields = {"newsID","appmsgid","idx","sn","title","cover","content_url","like_num","read_num","masssend_time"}
			newsDicts = []

			for msgInfo in totalNewsInfo:
				if msgInfo["type"] != 9: continue #type=9代表图文信息,非图文信息直接跳过

				masssend_time = msgInfo["sent_info"]["time"]

				for newsInfo in msgInfo["appmsg_info"]:
					if newsInfo["is_deleted"] or not len({"comment_id","copyright_type"} & newsInfo.keys()):
						continue  # the article has been deleted; skip it
					news = {k:v for k,v in newsInfo.items() if k in fields}
					for k,v in parse_qs(urlparse(newsInfo["content_url"]).query).items():
						if k in ("idx","itemidx"):
							news.update({"idx": v[0]})
						if k in ("sn","sign"):
							news.update({"sn": v[0]})
					news.update({"newsID": "{appmsgid:0>10d}{idx}".format(**news)})
					news.update({"masssend_time": datetime.fromtimestamp(masssend_time)})
					newsDicts.append(news)

			if method == "rebuild":
				self.insert_many("newsInfo", newsDicts)
				logger.info("Table newsInfo Create Success !")
			elif method == "update":
				'''oldNewsIDs = set(self.get_newsIDs())
				nowNewsIDs = set(news["newsID"] for news in newsDicts)

				new = nowNewsIDs - oldNewsIDs  # newly published articles
				self.insert_many("newsInfo", [news for news in newsDicts if news["newsID"] in new])

				delete = oldNewsIDs - nowNewsIDs  # deleted articles
				for newsID in delete:
					pass'''
				self.insert_many("newsInfo", newsDicts)  # read counts in newsInfo are refreshed daily, so overwrite the whole table
				logger.info("Table newsInfo Update Success !")
			else:
				raise ValueError("unexpected method '%s' !" % method)

		except Exception:
			raise
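The idx/sn extraction leans on the standard library's URL parsing. A standalone illustration with a made-up content_url (the URL and its values are hypothetical):

from urllib.parse import urlparse, parse_qs

url = "https://mp.weixin.qq.com/s?__biz=XXXX&idx=1&sn=abc123"  # hypothetical URL
query = parse_qs(urlparse(url).query)  # every value arrives as a list
print(query["idx"][0], query["sn"][0])  # -> 1 abc123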
Example #6
	def _cut_words(self, fromCache=True):
		if fromCache:
			wordFrags = pkl_load("wordFrags.pkl")
		else:
			wordFragsList = list()
			with DataBase() as db:
				newsID, newsData = db.get_news()
			jieba.enable_parallel(4)
			for news in show_status(newsData, "cut words"):
				frags = jieba.cut(news, cut_all=False)
				# drop stopwords, whitespace-only and purely numeric fragments
				words = [frag for frag in frags if frag not in self.stopWords
							and not frag.isspace() and not frag.isdigit()]
				wordFragsList.append(words)
			jieba.disable_parallel()
			wordFrags = dict(zip(newsID, wordFragsList))
			pkl_dump("wordFrags.pkl", wordFrags)
		return wordFrags
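`self.stopWords` is assumed to be preloaded; a common way to build such a set (the file name is an assumption):

def load_stopwords(path="stopwords.txt"):
    # one stopword per line; a set gives O(1) membership tests in the filter above
    with open(path, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}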
Example #7
    def __get_token(self):
        try:
            respData = requests.get("https://api.weixin.qq.com/cgi-bin/token",
                                    params={
                                        "grant_type": "client_credential",
                                        "appid": self.__appId,
                                        "secret": self.__appSecret,
                                    }).json()
            if "errcode" not in respData:
                accessToken = respData["access_token"]
                pkl_dump(self.Access_Token_File, accessToken, log=False)
                self.leftTime = respData["expires_in"]
                logger.info("get accesstoken")
            else:
                self.leftTime = 7200
                logger.info(respData)

        except Exception as err:
            logger.error(err)
            raise err
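WeChat access tokens stay valid for expires_in seconds (normally 7200), so callers usually cache the token and refresh only near expiry, as Example #9 does with its Critical_Time margin. A minimal sketch of that pattern (the class and its names are illustrative, not from the original code):

import time

class TokenCache:
    def __init__(self, fetch, margin=300):
        self._fetch = fetch    # callable returning (token, expires_in)
        self._margin = margin  # refresh this many seconds before real expiry
        self._token, self._expire_at = None, 0.0

    def get(self):
        if time.time() >= self._expire_at:  # expired or never fetched
            self._token, expires_in = self._fetch()
            self._expire_at = time.time() + expires_in - self._margin
        return self._token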
Example #8
def work1(begin, end):

    begin = TimeParser.toDate(begin)
    end = TimeParser.toDate(end)

    userData = []
    t = copy(begin)

    while t < end:
        print(t)
        # query in 7-day windows (begin and end dates inclusive), clipped to the overall end
        begin_date, end_date = TimeParser.toStr(t), TimeParser.toStr(min(t + timedelta(days=6), end))
        if begin_date == end_date:
            break
        try:
            userData.extend(Datacube.get_user_cumulate(begin_date, end_date))
            t += timedelta(days=7)
        except AbnormalErrcode:
            break

    userData = [data for data in userData if data["user_source"] == 0]
    pkl_dump("userData.pkl", userData)
Example #9
    def __update_token(self):
        self.logger.info('[%s] get access_token' % self.account)
        respJson = requests.get("https://api.weixin.qq.com/cgi-bin/token",
                                params={
                                    "grant_type": "client_credential",
                                    "appid": self.__appId,
                                    "secret": self.__appSecret,
                                }).json()
        if "errcode" not in respJson:
            self.__access_token = respJson['access_token']
            # expire a little early (Critical_Time) to leave a safety margin
            self.__expired = int(time.time()) + respJson['expires_in'] - self.Critical_Time
            pkl_dump(cachedir, self.Access_Token_File, self.__access_token, log=False)
            pkl_dump(cachedir, self.Access_Token_Expired_File, self.__expired, log=False)
        else:
            self.logger.error(respJson)
            raise requests.HTTPError(respJson)
Example #10
	def update_table_newsContent(self, method="update", fromCache=False):
		try:
			if method == "rebuild":
				if not fromCache:
					newsContents = WxSpider().batchget_newsContent(self.select("newsInfo", ("newsID","title","content_url")).fetchall())
					pkl_dump(self.Cache_NewsContent, newsContents)
				else:
					newsContents = pkl_load(self.Cache_NewsContent)
				self.insert_many("newsContent", newsContents)
				logger.info("Table newsContent Create Success !")
			elif method == "update":
				oldNewsIDs = set(row[0] for row in self.single_cur.execute("SELECT newsID FROM newsContent").fetchall())  # unwrap rows into bare newsIDs
				nowNewsIDs = set(self.get_newsIDs())

				new = nowNewsIDs - oldNewsIDs  # newly published articles
				newsInfos = self.select("newsInfo", ("newsID","title","content_url")).fetchall()
				newsContents = WxSpider().batchget_newsContent([news for news in newsInfos if news["newsID"] in new])
				self.insert_many("newsContent", newsContents)
				logger.info("Table newsContent Update Success !")
			else:
				raise ValueError("unexpected method '%s' !" % method)

		except Exception:
			raise
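`insert_many` is assumed to wrap `executemany` with named placeholders; a minimal sqlite3 sketch (the OR REPLACE clause matches the overwrite-daily comment in Example #5, but is an assumption):

def insert_many(self, table, dicts):
    # bulk-write homogeneous row dicts; keys double as column names
    if not dicts:
        return
    cols = list(dicts[0])
    sql = "INSERT OR REPLACE INTO %s (%s) VALUES (%s)" % (
        table, ",".join(cols), ",".join(":" + c for c in cols))
    self.single_cur.executemany(sql, dicts)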
Example #11
	def get_bins(self):
		keyWords = {newsID: self.extract(words) for newsID, words in self.fragments.items()}
		pkl_dump(self.Key_Words_File, keyWords)

		uniqueKeyWords = list(set(iter_flat(keyWords.values())))
		pkl_dump(self.Key_Words_List_File, uniqueKeyWords)

		bins = {newsID: np.array([(word in words) for word in uniqueKeyWords])
			for newsID, words in show_status(keyWords.items(), "get bins")}

		pkl_dump(self.Bins_File, bins)
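The `extract` helper is not shown; given jieba elsewhere in these examples, one plausible implementation is jieba's built-in TF-IDF keyword extractor (the topK value is an assumption):

import jieba.analyse

def extract(words, topK=20):
    # rejoin pre-cut fragments and pull the topK keywords by TF-IDF weight
    return jieba.analyse.extract_tags(" ".join(words), topK=topK)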