import time
import traceback

# Note: Spider, filterHrefs, incrDelay_time, log, LogType, Cache, cache and
# global_EXIT are defined elsewhere in the project.

def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out the content URLs of a website.
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False
        hrefs = filterHrefs(website_url, xpath, html_selector)
        if len(hrefs) == 0:
            return False
        flag = False
        for href in hrefs:
            # unrecognized_contentUrl_dict is a dict, so membership is checked
            # with keyExist (mirroring the dict check in putRecord below).
            if not Cache.listItemExist(cache.oldContent_list, href) and \
               not Cache.keyExist(cache.unrecognized_contentUrl_dict, href):
                Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
                flag = True
        if not flag:
            # No new data: push this site's next crawl back by 15 minutes (900 s).
            incrDelay_time(website_id, 900)
        return True
    except Exception:
        log.logMsg(LogType.error,
                   "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
        return False
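# The Cache helpers called above are project-specific and not shown in this
# section. A minimal sketch of what they are assumed to do follows:
# thread-safe wrappers around shared lists, dicts and queue.Queue objects.
# The method names match the call sites above, but the class name CacheSketch
# and every body here are assumptions, not the project's real implementation.
import threading

class CacheSketch:
    _lock = threading.Lock()

    @staticmethod
    def listItemExist(lst, item):
        # True if `item` is already in the shared list.
        with CacheSketch._lock:
            return item in lst

    @staticmethod
    def keyExist(dct, key):
        # True if `key` is already in the shared dict.
        with CacheSketch._lock:
            return key in dct

    @staticmethod
    def appendList(lst, item):
        with CacheSketch._lock:
            lst.append(item)

    @staticmethod
    def removeList(lst, item):
        with CacheSketch._lock:
            if item in lst:
                lst.remove(item)

    @staticmethod
    def putQueue(q, item):
        # queue.Queue is already thread-safe, so no extra locking is needed.
        q.put(item)

    @staticmethod
    def getDict(dct, key):
        # Default to 0 so a site with no recorded delay is crawled at once.
        with CacheSketch._lock:
            return dct.get(key, 0)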
    def putRecord(self, record):
        """
        @summary: Add the record to the queue of websites waiting to be crawled.
        """
        website_id, website_url, xpath = record[:3]
        if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
           not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
            Cache.appendList(cache.workingWebsite_list, website_id)
            Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
            # Wait out this site's crawl delay one second at a time so a
            # shutdown request (global_EXIT) is honoured promptly; on exit the
            # id is deliberately left in workingWebsite_list.
            sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
            for _ in range(int(sleep_time)):
                if global_EXIT:
                    return
                time.sleep(1)
            Cache.removeList(cache.workingWebsite_list, website_id)
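# Hedged usage sketch: putRecord fills cache.websiteUrl_queue, and a worker
# like the one below would drain it and run filterContentUrlFunc on each
# record. It assumes websiteUrl_queue is a standard queue.Queue; the worker
# function itself is hypothetical and not part of the original code.
import queue

def filter_content_worker():
    while not global_EXIT:
        try:
            website_id, website_url, xpath = cache.websiteUrl_queue.get(timeout=1)
        except queue.Empty:
            # No pending website records; poll again until shutdown.
            continue
        filterContentUrlFunc(website_id, website_url, xpath)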