Example #1
0
 def run(self):
     """Process unrecognized content URLs until ``global_EXIT`` becomes truthy.

     Each pass takes one key from ``cache.unrecognized_contentUrl_dict`` via
     ``Cache.randomKey``, looks up the value stored under it (passed on as
     the website id) and hands both to ``filterContentInfoFunc``. When the
     result is ``SpiderResType.success`` or ``SpiderResType.alreadyExist``
     the URL is removed from that dict and appended to
     ``cache.oldContent_list``.

     After the lookup the pass waits up to 300 seconds, re-checking
     ``global_EXIT`` before every one-second sleep. Any exception raised
     during a pass is logged together with the URL and the traceback, and
     the next pass starts immediately.
     """
     while not global_EXIT:
         url = ""
         try:
             pending = cache.unrecognized_contentUrl_dict
             url = Cache.randomKey(pending)
             if url:
                 website_id = Cache.getDict(pending, url)
                 outcome = filterContentInfoFunc(website_id, url)
                 handled = (
                     outcome == SpiderResType.success
                     or outcome == SpiderResType.alreadyExist
                 )
                 if handled:
                     # Move the URL from the "unrecognized" dict to the "old" list.
                     Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                     Cache.appendList(cache.oldContent_list, url)

             # Interruptible pause: at most 300 one-second sleeps, stopping
             # as soon as the exit flag is seen.
             seconds_left = 300
             while seconds_left > 0 and not global_EXIT:
                 time.sleep(1)
                 seconds_left -= 1
         except Exception:
             trace = traceback.format_exc()
             log.logMsg(
                 LogType.error,
                 "[FilterContentInfoThread.freshHandler] %s %s" % (url, trace),
             )
Example #2
0
    def run(self):
        """Drain ``cache.unrecognized_websiteUrl_dict`` until ``global_EXIT`` is truthy.

        Each pass:

        * If ``Cache.dempty`` reports the dict as empty, sleep one second and
          poll again (avoids a busy-wait while there is nothing to do).
        * Otherwise pick a key with ``Cache.randomKey``. If no key comes back,
          wait up to 30 seconds (checking ``global_EXIT`` before each
          one-second sleep) and start a new pass.
        * Unpack the stored value into ``(website_url, xpath)`` and remove the
          entry from the dict.

        Any exception raised during a pass is logged with the website URL
        (empty string if it was not fetched yet) and the traceback; the loop
        then continues.
        """
        while not global_EXIT:
            website_url = ""
            if Cache.dempty(cache.unrecognized_websiteUrl_dict):
                # Nothing queued: pause briefly instead of spinning the CPU.
                time.sleep(1)
                continue

            try:
                website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                if not website_id:
                    # Interruptible back-off of up to 30 seconds.
                    for _ in range(30):
                        if global_EXIT:
                            break
                        time.sleep(1)
                    # Restart the outer loop; previously this `continue` sat
                    # inside the `for` and execution fell through to the
                    # lookup below with a falsy key.
                    continue

                website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                # NOTE(review): a non-empty tuple is always truthy, so this
                # condition never fails and the entry is always removed. It
                # looks like a call to a handler taking these three arguments
                # was intended -- confirm against the original project.
                if (website_id, website_url, xpath):
                    Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)

            except Exception:
                log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))