def update(self):
    """Regenerate the derived data and reload it onto the instance.

    Runs ``get_fragments`` (with ``fromCache=False``), ``get_idfDict`` and
    ``get_bins`` in that order. After the first two steps the matching
    files are read back through ``pkl_load`` and stored on ``self`` as
    ``fragments``, ``idfDict`` and ``monoWords``.
    """
    # Step 1: fragments -- rebuild without the cache, then read the file back.
    self.get_fragments(fromCache=False)
    fragments = pkl_load(self.Fragments_File)
    self.fragments = fragments

    # Step 2: IDF dictionary and mono-word data, both read after get_idfDict()
    # (presumably that call writes both files -- confirm against get_idfDict).
    self.get_idfDict()
    idf_dict = pkl_load(self.IDF_Dict_File)
    self.idfDict = idf_dict
    mono_words = pkl_load(self.Mono_Words_File)
    self.monoWords = mono_words

    # Step 3: bins are rebuilt last.
    self.get_bins()
def work2():
    """Filter ``userData.pkl`` in place and extract two column lists.

    Records whose ``"user_source"`` is not ``0`` are dropped and the
    filtered list is written back to the same file. The ``"ref_date"`` and
    ``"cumulate_user"`` values of the remaining records are then collected
    into two lists; nothing in the visible body consumes them afterwards.
    """
    kept = []
    for record in pkl_load("userData.pkl"):
        if record["user_source"] == 0:
            kept.append(record)
    userData = kept

    # Overwrites the source file with the filtered records.
    pkl_dump("userData.pkl", userData)

    date = []
    for record in userData:
        date.append(record["ref_date"])

    userNum = []
    for record in userData:
        userNum.append(record["cumulate_user"])
def __init__(self, account):
    """Set up per-account logger, cache file names, credentials and token.

    ``account`` must be one of ``'rabbitw'``, ``'test'`` or ``'pkuyouth'``
    (checked with ``assert``). The app ID and secret come from
    ``get_secret``; the access token and its expiry value are read from
    ``cachedir`` with defaults of ``''`` and ``0`` respectively.
    """
    assert account in ('rabbitw', 'test', 'pkuyouth')

    self.account = account
    self.logger = Logger("%s.auth" % account)

    # File names are derived from the account name.
    token_file = "%s_accesstoken.pkl" % account
    expired_file = "%s_accesstoken_expired.pkl" % account
    self.Access_Token_File = token_file
    self.Access_Token_Expired_File = expired_file

    # Credentials.
    app_id = get_secret("%s_appID.pkl" % account)
    self.__appId = app_id
    app_secret = get_secret("%s_appSecret.pkl" % account)
    self.__appSecret = app_secret

    # Token state, with the defaults passed to pkl_load.
    access_token = pkl_load(cachedir, token_file, default='', log=False)
    self.__access_token = access_token
    expired = pkl_load(cachedir, expired_file, default=0, log=False)
    self.__expired = expired
def get_tops(newsID, top=10):
    """Return the ``top`` articles most similar to ``newsID``.

    A 0/1 vector over ``wordsList`` is built for the article (1 where the
    vocabulary word occurs in ``wordFrags[newsID]``) and compared against
    every vector in ``binarize`` using

        Tc = dot / (sum(this) + sum(other) - dot)

    i.e. shared words over the union of words, assuming the stored vectors
    are 0/1 like the one built here (TODO confirm for ``binarize.pkl``).

    Args:
        newsID: key into ``wordFrags.pkl``; a missing key raises ``KeyError``.
        top: maximum number of results to return.

    Returns:
        A list of ``(newsID, Tc)`` tuples sorted by ``Tc`` descending, at
        most ``top`` long. Entries with ``Tc`` equal to 0 or 1 are omitted,
        as are pairs whose union is empty.
    """
    wordsList = pkl_load("wordsList.pkl")
    binarize = pkl_load("binarize.pkl")
    wordFrags = pkl_load("wordFrags.pkl")

    # Mark each vocabulary word that occurs in this article.
    present = frozenset(wordFrags[newsID])
    thisBin = np.array([1 if word in present else 0 for word in wordsList])
    thisTotal = np.sum(thisBin)  # loop-invariant, computed once

    tcs = {}
    for _newsID, otherBin in binarize.items():
        common = np.sum(np.dot(thisBin, otherBin))
        union = thisTotal + np.sum(otherBin) - common
        if union == 0:
            # Both vectors are empty: 0/0 would give NaN, which slips past
            # the {0, 1} filter below and makes the sort order unreliable.
            continue
        Tc = common / union
        if Tc not in {0, 1}:  # drop reposts (identical) and fully unrelated articles
            tcs[_newsID] = Tc

    return sorted(tcs.items(), key=lambda item: item[1], reverse=True)[:top]
def update_table_newsInfo(self, method="update", fromCache=False):
    """Build the table of mass-sent article info (``newsInfo``).

    Args:
        method: ``"rebuild"`` or ``"update"``. Both pass every parsed
            article to ``insert_many``; only the log message differs.
        fromCache: when true, the raw data is read from
            ``self.Cache_NewsInfo``; otherwise it is fetched through
            ``WxSpider().batchget_newsInfo()`` and written to that cache.

    Raises:
        ValueError: if ``method`` is neither ``"rebuild"`` nor ``"update"``.
    """
    # Reject a bad argument before the fetch and cache write happen.
    if method not in ("rebuild", "update"):
        raise ValueError("unexpected method '%s' !" % method)

    if fromCache:
        # Read from the local cache.
        totalNewsInfo = pkl_load(self.Cache_NewsInfo)
    else:
        logger.info("Getting newsInfo...")
        totalNewsInfo = WxSpider().batchget_newsInfo()
        pkl_dump(self.Cache_NewsInfo, totalNewsInfo)

    fields = {
        "newsID", "appmsgid", "idx", "sn", "title", "cover",
        "content_url", "like_num", "read_num", "masssend_time",
    }
    markerKeys = {"comment_id", "copyright_type"}

    newsDicts = []
    for msgInfo in totalNewsInfo:
        if msgInfo["type"] != 9:
            continue  # type 9 is an article message; skip everything else
        masssend_time = msgInfo["sent_info"]["time"]
        for newsInfo in msgInfo["appmsg_info"]:
            if newsInfo["is_deleted"] or not (markerKeys & newsInfo.keys()):
                continue  # the article has been deleted; skip it
            news = {k: v for k, v in newsInfo.items() if k in fields}
            # idx and sn are taken from the URL query string, which may
            # spell them "itemidx" and "sign".
            query = parse_qs(urlparse(newsInfo["content_url"]).query)
            for k, v in query.items():
                if k in ("idx", "itemidx"):
                    news["idx"] = v[0]
                if k in ("sn", "sign"):
                    news["sn"] = v[0]
            # newsID is appmsgid zero-padded to 10 digits followed by idx.
            news["newsID"] = "{appmsgid:0>10d}{idx}".format(**news)
            news["masssend_time"] = datetime.fromtimestamp(masssend_time)
            newsDicts.append(news)

    # Read counts in newsInfo change daily, so every row is written again
    # rather than only the newly published ones.
    self.insert_many("newsInfo", newsDicts)
    if method == "rebuild":
        logger.info("Table newsInfo Create Success !")
    else:
        logger.info("Table newsInfo Update Success !")
def _cut_words(self, fromCache=True):
    """Return the segmented words of every article, keyed by news ID.

    Args:
        fromCache: when true, return the content of ``wordFrags.pkl``
            as-is. Otherwise the texts returned by ``DataBase.get_news()``
            are segmented with ``jieba.cut`` and the result is written to
            ``wordFrags.pkl``.

    Returns:
        When recomputed, a dict mapping each news ID to its list of kept
        fragments. Fragments found in ``self.stopWords``, whitespace-only
        fragments and all-digit fragments are dropped.
    """
    if fromCache:
        return pkl_load("wordFrags.pkl")

    wordFragsList = []
    with DataBase() as db:
        newsID, newsData = db.get_news()
        jieba.enable_parallel(4)
        try:
            for news in show_status(newsData, "cut words"):
                words = [
                    frag
                    for frag in jieba.cut(news, cut_all=False)
                    if frag not in self.stopWords
                    and not frag.isspace()
                    and not frag.isdigit()
                ]
                wordFragsList.append(words)
        finally:
            # Always switch parallel mode off again, even if cutting fails.
            jieba.disable_parallel()
        wordFrags = dict(zip(newsID, wordFragsList))

    # The original call omitted the data argument, so nothing was cached.
    pkl_dump("wordFrags.pkl", wordFrags)
    return wordFrags
def plt_show():
    """Plot every pairwise similarity coefficient in descending order.

    For each pair of entries in ``binarize.pkl`` whose first ID is not
    greater than the second (so each unordered pair is evaluated once, plus
    each entry against itself), the value
    ``dot / (sum(x) + sum(y) - dot)`` is collected. The values are sorted
    from largest to smallest and drawn with ``plt.plot``.
    """
    binarize = pkl_load("binarize.pkl")

    results = []
    for id_a, bin_a in show_status(binarize.items()):
        for id_b, bin_b in binarize.items():
            if id_a > id_b:
                # Halves the work: the mirrored pair covers this one.
                continue
            overlap = np.dot(bin_a, bin_b)
            union = np.sum(bin_a) + np.sum(bin_b) - overlap
            results.append(overlap / union)

    results = sorted(results, reverse=True)
    plt.plot(np.array(results))
    plt.show()
def update_table_newsContent(self, method="update", fromCache=False):
    """Build or extend the ``newsContent`` table.

    Args:
        method: ``"rebuild"`` loads content for every row of ``newsInfo``;
            ``"update"`` fetches content only for news IDs returned by
            ``get_newsIDs()`` that are not yet in ``newsContent``.
        fromCache: only consulted when ``method == "rebuild"``. When true
            the content is read from ``self.Cache_NewsContent``; otherwise
            it is fetched with ``WxSpider`` and written to that cache.

    Raises:
        ValueError: if ``method`` is neither ``"rebuild"`` nor ``"update"``.
    """
    columns = ("newsID", "title", "content_url")

    if method == "rebuild":
        if fromCache:
            newsContents = pkl_load(self.Cache_NewsContent)
        else:
            spider = WxSpider()
            newsInfos = self.select("newsInfo", columns).fetchall()
            newsContents = spider.batchget_newsContent(newsInfos)
            pkl_dump(self.Cache_NewsContent, newsContents)
        self.insert_many("newsContent", newsContents)
        logger.info("Table newsContent Create Success !")
    elif method == "update":
        # NOTE(review): the set difference relies on single_cur yielding
        # values comparable to what get_newsIDs() returns -- confirm.
        oldNewsIDs = set(self.single_cur.execute("SELECT newsID FROM newsContent").fetchall())
        nowNewsIDs = set(self.get_newsIDs())
        new = nowNewsIDs - oldNewsIDs  # newly published articles
        newsInfos = self.select("newsInfo", columns).fetchall()
        pending = [news for news in newsInfos if news["newsID"] in new]
        newsContents = WxSpider().batchget_newsContent(pending)
        self.insert_many("newsContent", newsContents)
        logger.info("Table newsContent Update Success !")
    else:
        raise ValueError("unexpected method '%s' !" % method)
def init_for_match(self):
    """Load ``self.Bins_File`` into ``self.bins`` and return ``self``.

    Returning the instance allows the call to be chained.
    """
    bins = pkl_load(self.Bins_File, log=False)
    # NOTE: a per-entry ``astype(np.int8)`` conversion of the loaded values
    # used to follow here and is currently disabled.
    self.bins = bins
    return self
def init_for_update(self):
    """Load ``self.Stop_Words_File`` into ``self.stopWords`` and return ``self``.

    Returning the instance allows the call to be chained.
    """
    stop_words = pkl_load(self.Stop_Words_File)
    self.stopWords = stop_words
    return self
def get(cls):
    """Return whatever ``pkl_load`` reads from ``cls.Access_Token_File``.

    The load is performed with ``log=False``.
    """
    token_file = cls.Access_Token_File
    return pkl_load(token_file, log=False)
def __init__(self):
    """Read the mini-program app ID and app secret from ``secretdir``.

    Both values are loaded with ``pkl_load(..., log=False)`` and kept in
    private (name-mangled) attributes.
    """
    app_id = pkl_load(secretdir, "miniprogram_appID.pkl", log=False)
    self.__appID = app_id

    app_secret = pkl_load(secretdir, "miniprogram_appSecret.pkl", log=False)
    self.__appSecret = app_secret