import time
import traceback

# Project-level names (Spider, Cache, cache, log, LogType, Mysql, filterHrefs,
# incrDelay_time, global_EXIT) are assumed to be provided by the surrounding
# crawler modules.


def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out the content URLs of a website.
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False
        hrefs = filterHrefs(website_url, xpath, html_selector)
        if len(hrefs) == 0:
            return False
        flag = False
        for href in hrefs:
            # unrecognized_contentUrl_dict is a dict, so it is checked by key
            # (the original used Cache.listItemExist here, which matches the
            # list API used elsewhere and looks like a slip).
            if not Cache.listItemExist(cache.oldContent_list, href) and \
                    not Cache.keyExist(cache.unrecognized_contentUrl_dict, href):
                Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
                flag = True
        if not flag:
            # No new data: push this site's next crawl back by 15 minutes.
            incrDelay_time(website_id, 900)
        return True
    except Exception:
        log.logMsg(LogType.error,
                   "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
        return False
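
# For context: a minimal sketch (assumptions, not project code) of the worker
# loop that would drive filterContentUrlFunc. The real queue is the Redis-backed
# cache.websiteUrl_queue; stdlib queue/threading objects stand in for it here.
import queue
import threading

website_queue = queue.Queue()   # stand-in for cache.websiteUrl_queue
exit_event = threading.Event()  # stand-in for the global_EXIT flag

def filter_content_url_worker():
    """Pop (website_id, website_url, xpath) records and filter each site."""
    while not exit_event.is_set():
        try:
            website_id, website_url, xpath = website_queue.get(timeout=1)
        except queue.Empty:
            continue  # queue idle; loop back and re-check the exit flag
        # filterContentUrlFunc returns False on spider/selector failures; a
        # real worker might count failures per site before giving up on it.
        filterContentUrlFunc(website_id, website_url, xpath)

threading.Thread(target=filter_content_url_worker, daemon=True).start()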

def putRecord(self, record):
    """
    @summary: Add the record to the queue of websites waiting to be crawled.
    """
    website_id, website_url, xpath = record[:3]
    if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
            not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
        Cache.appendList(cache.workingWebsite_list, website_id)
        Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
        # Sleep in one-second slices so the per-site delay can be interrupted
        # promptly when the global exit flag is raised.
        sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
        for _ in range(int(sleep_time)):
            if global_EXIT:
                return
            time.sleep(1)
        Cache.removeList(cache.workingWebsite_list, website_id)
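
# putRecord sleeps for whatever delay cache.websiteDelay_dict records for the
# site, and filterContentUrlFunc widens that delay by 900 s when a crawl finds
# nothing new. The project's incrDelay_time is not shown; below is a plausible
# sketch only -- the MAX_DELAY cap and the Cache.setDict writer (assumed as the
# counterpart of Cache.getDict) are hypothetical, not confirmed project API.
MAX_DELAY = 6 * 3600  # hypothetical ceiling: still poll a quiet site a few times a day

def incrDelay_time(website_id, seconds):
    """Increase a site's crawl delay, clamped to MAX_DELAY."""
    current = int(Cache.getDict(cache.websiteDelay_dict, website_id) or 0)
    Cache.setDict(cache.websiteDelay_dict, website_id,  # setDict is assumed
                  min(current + seconds, MAX_DELAY))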

def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary: Push the log message to Redis (partialNone entries also go to the database).
    :param logType: log type
    :param msg: log message
    :param website_id: website id
    :param content_url: content URL
    :return:
    """
    if logType == LogType.error and msg:
        msg = "》Error《:%s" % msg
    elif logType in (LogType.htmlSelectorNone, LogType.partialNone):
        msg = "?Warning?:%s" % msg
    elif logType == LogType.success:
        msg = "【Success】:%s" % msg
    else:
        msg = "--Other--:%s" % msg
    if logType == LogType.partialNone:
        # partialNone entries are additionally persisted to MySQL together
        # with their (website_id, content_url) pair.
        Mysql.writeWebsiteMsg(website_id, content_url)
    Cache.putQueue(cache.log_queue, msg)
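
# Messages pushed by logMsg land on the Redis-backed cache.log_queue; something
# must drain them. The project's reader is not shown, so this consumer is an
# illustrative sketch with a stdlib queue standing in for the Redis one.
import sys

log_queue = queue.Queue()     # stand-in for cache.log_queue
log_exit = threading.Event()  # separate stop flag for the log consumer

def drain_logs():
    """Pop formatted messages ("》Error《:...", "【Success】:...") and emit them."""
    while not log_exit.is_set():
        try:
            msg = log_queue.get(timeout=1)
        except queue.Empty:
            continue
        sys.stdout.write(msg + "\n")

threading.Thread(target=drain_logs, daemon=True).start()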