def initContentUrl_dict():
    """
    @summary: Initialize the deduplication list of already-crawled content URLs.
    """
    items = mysql.Mysql.queryContentUrl()
    for item in items:
        Cache.appendList(cache.oldContent_list, item[0])
def run(self):
    while not global_EXIT:
        url = ""
        try:
            website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
            res = filterContentInfoFunc(website_id, url)
            if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                Cache.appendList(cache.oldContent_list, url)
            else:
                Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
        except Exception as e:
            if type(e) is not queue.Empty:
                log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
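# --- Illustrative sketch (not project code) ---------------------------------
# The run() loop above follows a non-blocking consumer pattern: pull from a
# queue without blocking, treat queue.Empty as a normal idle condition, and
# keep re-checking a shutdown flag. The stand-alone sketch below reproduces
# that shape with the standard library only; stop_event, work_queue and
# handle() are hypothetical stand-ins for global_EXIT,
# cache.freshContentUrl_queue and filterContentInfoFunc.
import queue
import threading
import traceback

stop_event = threading.Event()
work_queue = queue.Queue()

def handle(website_id, url):
    print("processing", website_id, url)

def consumer():
    while not stop_event.is_set():
        try:
            # Non-blocking get raises queue.Empty immediately when idle,
            # so the loop never stops checking the shutdown flag.
            website_id, url = work_queue.get(block=False)
            handle(website_id, url)
        except queue.Empty:
            stop_event.wait(0.5)  # brief back-off instead of busy-looping
        except Exception:
            print(traceback.format_exc())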
def putRecord(self, record):
    """
    @summary: Add the record to the queue of websites waiting to be crawled.
    """
    website_id, website_url, xpath = record[:3]
    if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
       not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
        Cache.appendList(cache.workingWebsite_list, website_id)
        Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
        sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
        for i in range(int(sleep_time)):
            if global_EXIT:
                return
            time.sleep(1)
        Cache.removeList(cache.workingWebsite_list, website_id)
def run(self):
    while not global_EXIT:
        url = ""
        try:
            url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
            if url:
                website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                res = filterContentInfoFunc(website_id, url)
                if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                    Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                    Cache.appendList(cache.oldContent_list, url)
            for i in range(300):
                if global_EXIT:
                    break
                time.sleep(1)
        except Exception as e:
            log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
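# --- Illustrative sketch (not project code) ---------------------------------
# Both run() methods and putRecord() sleep in one-second slices so the
# shutdown flag can interrupt long waits (e.g. the 300-second retry delay
# above). The sketch below shows the same worker-thread shape with
# hypothetical names, using threading.Event so the wait can be cut short the
# moment shutdown is requested.
import threading
import time

class RetryWorker(threading.Thread):
    def __init__(self, stop_event):
        super().__init__(daemon=True)
        self.stop_event = stop_event

    def run(self):
        while not self.stop_event.is_set():
            # ... one pass of work would go here ...
            # Interruptible back-off: plays the role of the
            # "for i in range(300): time.sleep(1)" loop above.
            self.stop_event.wait(300)

if __name__ == "__main__":
    stop = threading.Event()
    worker = RetryWorker(stop)
    worker.start()
    time.sleep(2)   # let the worker run briefly
    stop.set()      # request shutdown; the pending wait() returns immediately
    worker.join()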