def work2():
    """Reduce the cached user data to source-0 records and rewrite the cache.

    Loads userData.pkl, keeps only entries whose user_source is 0
    (presumably organic followers — verify against the data spec),
    writes the filtered list back, then pulls out the date and
    cumulative-user series.
    """
    records = pkl_load("userData.pkl")
    kept = []
    for rec in records:
        if rec["user_source"] == 0:
            kept.append(rec)
    pkl_dump("userData.pkl", kept)
    # Series extracted for downstream use (kept as in the original).
    date = [rec["ref_date"] for rec in kept]
    userNum = [rec["cumulate_user"] for rec in kept]
def get_idfDict(self):
    """Compute document frequencies over self.fragments and persist
    both the once-only word set and the IDF dictionary.
    """
    # Deduplicate words within each document so the Counter yields
    # document frequency rather than term frequency.
    docFreq = Counter(iter_flat([list(set(ws)) for _, ws in self.fragments.items()]))
    # Words that appear in only one document
    singletons = {w for w, df in docFreq.items() if df <= 1}
    pkl_dump(self.Mono_Words_File, singletons)
    total = len(self.fragments)
    idf = {w: math.log(total / df) for w, df in docFreq.items()}
    pkl_dump(self.IDF_Dict_File, idf)
def get_fragments(self, fromCache=False):
    """Cut every stored news article into word fragments and persist them.

    When fromCache is True the expensive rebuild is skipped entirely
    (callers apparently load the pickle themselves).
    """
    if fromCache:
        return
    with SQLiteDB() as newsDB:
        rows = newsDB.select("newsContent", ("newsID", "content")).fetchall()
    # Drop articles that were explicitly discarded.
    kept = [row for row in rows if row["newsID"] not in self.discard_newsIDs]
    fragments = {}
    for row in show_status(kept, "cut news"):
        fragments[row["newsID"]] = self.lcut(row["content"])
    pkl_dump(self.Fragments_File, fragments)
def get_binarization(self):
    """Build a 0/1 bag-of-words vector for every news item and persist
    both the vocabulary order and the vectors.

    Vector element i of a news item is 1 iff wordsList[i] occurs among
    that item's fragment words.

    Improvements over the original: the intermediate |news| x |vocab|
    dict-of-dicts is gone (each vector is built directly, cutting peak
    memory), and the redundant wordsSet membership test is removed —
    every candidate word already comes from wordsList.
    """
    wordsList = [line["word"] for line in read_csv("word_frequency.csv")]
    binarize = {}
    for newsID, words in show_status(self.wordFrags.items(), "get binarization"):
        present = set(words)  # O(1) membership per vocabulary word
        binarize[newsID] = np.array([1 if word in present else 0 for word in wordsList])
    pkl_dump("wordsList.pkl", wordsList)
    pkl_dump("binarize.pkl", binarize)
def update_table_newsInfo(self, method="update", fromCache=False):
    """Build or refresh the mass-send rich-media (news) info table.

    Args:
        method: "rebuild" to create the table from scratch, "update" to
            overwrite it with current data (read counts change daily, so
            the whole table is rewritten either way).
        fromCache: when True, load the raw news info from the local
            pickle cache instead of crawling it.

    Raises:
        ValueError: if method is neither "rebuild" nor "update".

    Improvements over the original: the no-op ``except Exception as err:
    raise err`` wrapper and the dead commented-out incremental-update
    code were removed; comments translated to English.
    """
    if not fromCache:
        logger.info("Getting newsInfo...")
        totalNewsInfo = WxSpider().batchget_newsInfo()
        pkl_dump(self.Cache_NewsInfo, totalNewsInfo)
    else:
        # Load from the local cache
        totalNewsInfo = pkl_load(self.Cache_NewsInfo)
    fields = {"newsID", "appmsgid", "idx", "sn", "title", "cover",
              "content_url", "like_num", "read_num", "masssend_time"}
    newsDicts = []
    for msgInfo in totalNewsInfo:
        if msgInfo["type"] != 9:
            continue  # type == 9 marks rich-media (news) messages; skip everything else
        masssend_time = msgInfo["sent_info"]["time"]
        for newsInfo in msgInfo["appmsg_info"]:
            # Deleted articles carry is_deleted, or lack both
            # comment_id and copyright_type — skip them.
            if newsInfo["is_deleted"] or not ({"comment_id", "copyright_type"} & newsInfo.keys()):
                continue
            news = {k: v for k, v in newsInfo.items() if k in fields}
            # Pull idx/sn from the article URL; key names vary by URL era.
            for k, v in parse_qs(urlparse(newsInfo["content_url"]).query).items():
                if k in ("idx", "itemidx"):
                    news["idx"] = v[0]
                if k in ("sn", "sign"):
                    news["sn"] = v[0]
            news["newsID"] = "{appmsgid:0>10d}{idx}".format(**news)
            news["masssend_time"] = datetime.fromtimestamp(masssend_time)
            newsDicts.append(news)
    if method == "rebuild":
        self.insert_many("newsInfo", newsDicts)
        logger.info("Table newsInfo Create Success !")
    elif method == "update":
        # Read counts in newsInfo are refreshed daily, so everything is overwritten.
        self.insert_many("newsInfo", newsDicts)
        logger.info("Table newsInfo Update Success !")
    else:
        raise ValueError("unexpected method '%s' !" % method)
def _cut_words(self, fromCache=True):
    """Segment every news text into a list of words, keyed by newsID.

    Args:
        fromCache: when True, load the previously pickled result
            instead of re-cutting.

    Returns:
        dict mapping newsID -> list of word fragments, with stop words,
        whitespace-only fragments, and pure digits filtered out.

    BUG FIX: the original called ``pkl_dump("wordFrags.pkl")`` without
    the payload argument, so the cache was never written with the data.
    Also, ``jieba.disable_parallel()`` is now guaranteed via finally.
    """
    if fromCache:
        wordFrags = pkl_load("wordFrags.pkl")
    else:
        wordFragsList = []
        with DataBase() as db:
            newsID, newsData = db.get_news()
        jieba.enable_parallel(4)
        try:
            for news in show_status(newsData, "cut words"):
                frags = jieba.cut(news, cut_all=False)
                words = [frag for frag in frags
                         if frag not in self.stopWords
                         and not frag.isspace()
                         and not frag.isdigit()]
                wordFragsList.append(words)
        finally:
            jieba.disable_parallel()
        wordFrags = dict(zip(newsID, wordFragsList))
        pkl_dump("wordFrags.pkl", wordFrags)
    return wordFrags
def __get_token(self):
    """Fetch a WeChat access token and cache it.

    On success the token is pickled and self.leftTime is set from the
    API's expires_in; on an API-level error (errcode present) a default
    lifetime of 7200s is assumed and the response is logged. Transport
    errors are logged and re-raised.
    """
    query = {
        "grant_type": "client_credential",
        "appid": self.__appId,
        "secret": self.__appSecret,
    }
    try:
        payload = requests.get("https://api.weixin.qq.com/cgi-bin/token",
                               params=query).json()
        if "errcode" in payload:
            # API signalled failure — keep going with the default lifetime.
            self.leftTime = 7200
            logger.info(payload)
        else:
            pkl_dump(self.Access_Token_File, payload["access_token"], log=False)
            self.leftTime = payload["expires_in"]
            logger.info("get accesstoken")
    except Exception as err:
        logger.error(err)
        raise err
def work1(begin, end):
    """Fetch cumulative-user stats in 7-day windows over [begin, end),
    keep only source-0 records, and pickle the result.

    Stops early on an AbnormalErrcode from the API or when a window
    degenerates to a single day.
    """
    begin = TimeParser.toDate(begin)
    end = TimeParser.toDate(end)
    records = []
    cursor = copy(begin)
    while cursor < end:
        print(cursor)
        window_end = min(cursor + timedelta(7 - 1), end)
        start_str = TimeParser.toStr(cursor)
        stop_str = TimeParser.toStr(window_end)
        if start_str == stop_str:
            break
        try:
            records.extend(Datacube.get_user_cumulate(start_str, stop_str))
        except AbnormalErrcode:
            break
        cursor += timedelta(7)
    records = [rec for rec in records if rec["user_source"] == 0]
    pkl_dump("userData.pkl", records)
def __update_token(self):
    """Request a new access token, cache it with its expiry time.

    Raises requests.HTTPError when the API response carries an errcode.
    """
    self.logger.info('[%s] get access_token' % self.account)
    params = {
        "grant_type": "client_credential",
        "appid": self.__appId,
        "secret": self.__appSecret,
    }
    resp = requests.get("https://api.weixin.qq.com/cgi-bin/token",
                        params=params).json()
    if "errcode" in resp:
        self.logger.error(resp)
        raise requests.HTTPError(resp)
    self.__access_token = resp['access_token']
    # Expire early by Critical_Time so the token is refreshed before hard expiry.
    self.__expired = int(time.time()) + resp['expires_in'] - self.Critical_Time
    pkl_dump(cachedir, self.Access_Token_File, self.__access_token, log=False)
    pkl_dump(cachedir, self.Access_Token_Expired_File, self.__expired, log=False)
def update_table_newsContent(self, method="update", fromCache=False):
    """Build or incrementally refresh the newsContent table.

    Args:
        method: "rebuild" to crawl/load everything and recreate the
            table, "update" to fetch only articles not yet stored.
        fromCache: (rebuild only) load contents from the local pickle
            cache instead of crawling.

    Raises:
        ValueError: if method is neither "rebuild" nor "update".

    Improvement over the original: the no-op ``except Exception as err:
    raise err`` wrapper was removed — exceptions propagate unchanged.
    """
    if method == "rebuild":
        if not fromCache:
            newsContents = WxSpider().batchget_newsContent(
                self.select("newsInfo", ("newsID", "title", "content_url")).fetchall())
            pkl_dump(self.Cache_NewsContent, newsContents)
        else:
            newsContents = pkl_load(self.Cache_NewsContent)
        self.insert_many("newsContent", newsContents)
        logger.info("Table newsContent Create Success !")
    elif method == "update":
        # NOTE(review): fetchall() yields row objects/tuples here while
        # get_newsIDs() presumably yields bare IDs — confirm both sets
        # hold comparable values, otherwise 'new' is always everything.
        oldNewsIDs = set(self.single_cur.execute("SELECT newsID FROM newsContent").fetchall())
        nowNewsIDs = set(self.get_newsIDs())
        new = nowNewsIDs - oldNewsIDs  # newly published articles
        newsInfos = self.select("newsInfo", ("newsID", "title", "content_url")).fetchall()
        newsContents = WxSpider().batchget_newsContent(
            [news for news in newsInfos if news["newsID"] in new])
        self.insert_many("newsContent", newsContents)
        logger.info("Table newsContent Update Success !")
    else:
        raise ValueError("unexpected method '%s' !" % method)
def get_bins(self):
    """Extract keywords for every news item and persist boolean
    membership vectors over the combined keyword vocabulary.
    """
    keyWords = {}
    for newsID, frags in self.fragments.items():
        keyWords[newsID] = self.extract(frags)
    pkl_dump(self.Key_Words_File, keyWords)
    # Fixed vocabulary: all distinct keywords across every document.
    vocab = list(set(iter_flat(keyWords.values())))
    pkl_dump(self.Key_Words_List_File, vocab)
    bins = {}
    for newsID, kws in show_status(keyWords.items(), "get bins"):
        bins[newsID] = np.array([(term in kws) for term in vocab])
    pkl_dump(self.Bins_File, bins)