def monitor_proxies():
    redis_0 = RedisDB()
    config = os.path.join(os.path.dirname(__file__), '../config.conf')
    redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
    redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
    douban_count = redis_0.count(redis_key)
    weibo_count = redis_0.count(redis_key2)
    log.debug("douban: %d proxy IPs left in the redis pool" % douban_count)
    log.debug("weibo: %d proxy IPs left in the redis pool" % weibo_count)
def random_proxy():
    try:
        redis = RedisDB()
        ip_pools = redis.sget(table=redis_key, count=1)
        proxy = random.choice(ip_pools)
        proxies = {
            "http": proxy,
            "https": proxy,
        }
    except Exception as e:
        print(e)
        proxies = {}
    return proxies
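# --- Usage sketch (illustrative, not part of the original source) ---
# Shows how the proxies dict returned by random_proxy() can be passed to an HTTP
# client. The module path `proxy_pool` and the target URL are assumptions; only
# the shape of the return value ({"http": ..., "https": ...} or {}) comes from
# the snippet above.
import requests

from proxy_pool import random_proxy  # hypothetical module path

proxies = random_proxy()
# Fall back to a direct connection when the pool is empty ({} -> None).
response = requests.get("https://httpbin.org/ip", proxies=proxies or None, timeout=10)
print(response.text)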
class Detection(object):
    def __init__(self):
        self.redis = RedisDB()

    async def detection_proxy(self, proxy, semaphore):
        async with semaphore:
            con = aiohttp.TCPConnector(verify_ssl=False, family=socket.AF_INET, limit=60)
            async with aiohttp.ClientSession(connector=con) as session:
                try:
                    test_proxy = "http://" + proxy
                    log.debug("testing proxy: " + test_proxy)
                    async with session.get(TEST_URL, proxy=test_proxy, timeout=7) as response:
                        html = await response.text()
                        # '检测到有异常请求' is the anti-bot message returned by the target page
                        if response.status == 200 and '检测到有异常请求' not in html:
                            log.debug("\n" + proxy + " proxy is usable")
                        else:
                            self.redis.delete_value(redis_key, proxy)
                            log.debug("removed dead proxy: " + proxy)
                except Exception as e:
                    self.redis.delete_value(redis_key, proxy)
                    log.debug("\n" + proxy + " proxy request failed")
                    log.debug("removed dead proxy: " + proxy)

    def run(self):
        try:
            proxies = self.redis.get_all(redis_key)
            for i in range(0, len(proxies), BATCH_SIZE):
                test_proxies = proxies[i:i + BATCH_SIZE]
                self.main(test_proxies)
        except Exception as e:
            log.debug("error while testing proxies", e.args)

    def main(self, test_proxies):
        semaphore = asyncio.Semaphore(5)
        loop = asyncio.get_event_loop()
        tasks = [
            self.detection_proxy(proxy, semaphore)
            for proxy in test_proxies
        ]
        loop.run_until_complete(asyncio.wait(tasks))
class Detection(object):
    def __init__(self):
        self.redis = RedisDB()
        self.test_url = 'https://m.weibo.cn/'

    @tools.debug
    async def get_html(self, root_url, proxy, semaphore):
        try:
            test_proxy = "http://" + proxy
            log.debug("testing proxy: " + test_proxy)
            async with semaphore:
                response = await requests.get(root_url, proxy=test_proxy, timeout=5)
                html = await response.text()
                return response, html
        except asyncio.TimeoutError as err:
            # log.debug(err)
            return [], []

    @tools.debug
    async def run(self, content_info):
        semaphore = asyncio.Semaphore(10)
        try:
            response, html = await self.get_html(self.test_url, content_info, semaphore)
            if html and response:
                # '检测到有异常请求' is the anti-bot message returned by the target page
                if response.status == 200 and '检测到有异常请求' not in html:
                    log.debug("\n" + content_info + " proxy is usable")
                else:
                    self.redis.delete_value(redis_key, content_info)
                    log.debug("removed dead proxy: " + content_info)
            else:
                self.redis.delete_value(redis_key, content_info)
                log.debug("removed dead proxy: " + content_info)
        except Exception as e:
            print(e)
            self.redis.delete_value(redis_key, content_info)
            log.debug("\n" + content_info + " proxy request failed")
            log.debug("removed dead proxy: " + content_info)

    def doing_main(self):
        task_list = self.redis.get_all(redis_key)
        log.debug('total IPs in the redis pool: {}'.format(len(task_list)))
        tasks = [asyncio.ensure_future(self.run(data)) for data in task_list]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.wait(tasks, timeout=6))
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()
            self._thread_stop = False
            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

    def clear_article(self):
        '''
        @summary: clear the cached articles in redis
        ---------
        ---------
        @result:
        '''
        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            article_list.append(article)
            if len(article_list) > 100:
                log.debug('adding articles to redis')
                self._db.zadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('adding articles to redis')
            self._db.zadd(self._table_article, article_list)
class SyncArtice(threading.Thread):
    def __init__(self):
        super(SyncArtice, self).__init__()
        self._es = ES()
        self._redis = RedisDB()
        self._sync_count = 0

    def run(self):
        is_show_tip = False
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    if not is_show_tip:
                        print('\n{time} no data, sleeping... '.format(
                            time=tools.get_current_date()))
                        is_show_tip = True
                elif self.add_data_to_es(datas):
                    is_show_tip = False
                    self._sync_count += len(datas)
                    tools.print_one_line('synced %d records' % self._sync_count)

                tools.delay_time(1)
            except Exception as e:
                log.error(e)

    def get_data_from_redis(self, count):
        datas = self._redis.sget('news:news_article', count=count)
        return_datas = []
        for data in datas:
            data = eval(data)
            release_time = data.get('release_time')
            if release_time and len(release_time) == 19:
                return_datas.append(data)
        return return_datas

    def add_data_to_es(self, datas):
        return self._es.add_batch(datas, primary_key='uuid', table='news_article')
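# --- Usage sketch (illustrative, not part of the original source) ---
# Starts the redis-to-ES sync thread defined above. The module path
# `sync_article` and the daemon/join handling are assumptions; the class name
# and its never-ending run() loop come from the snippet above.
from sync_article import SyncArtice  # hypothetical module path

syncer = SyncArtice()
syncer.daemon = True  # let the process exit even though run() loops forever
syncer.start()
syncer.join()         # block the main thread while the sync loop runs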
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url='urls'):
        if not hasattr(self, '_table_url'):
            super(UrlManager, self).__init__()
            self._thread_stop = False
            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: clear the cached urls in redis
        ---------
        ---------
        @result:
        '''
        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []
        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)
            max_depth = url.get('remark', {}).get('spider_depth', 0)

            # To count urls per depth, the fingerprints are kept in a zset for now and the
            # fingerprint is checked before the last-depth set. If per-depth counts are not
            # needed, a plain set (checking the last-depth urls first) is recommended.
            if depth == max_depth - 1:
                # urls of the last depth are stored separately so they never need to be cleared
                if self._db.zadd(self._table_url_dupefilter, url_id, depth) and \
                        self._db.sadd(self._table_url_end_depth_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)
            elif self._db.zadd(self._table_url_dupefilter, url_id, depth):
                url_list.append(url)
                prioritys.append(depth)

            if len(url_list) > 100:
                log.debug('adding urls to redis')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('adding urls to redis')
            self._db.zadd(self._table_url, url_list, prioritys)
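# --- Usage sketch (illustrative, not part of the original source) ---
# Feeds one url dict into the UrlManager above. The module path `url_manager`
# and the example url are assumptions; the dict shape mirrors the url_dict built
# by TaskManager.get_task_from_oracle() further down in this collection.
from url_manager import UrlManager  # hypothetical module path

url_manager = UrlManager(table_url='news:news_urls')
url_manager.start()

url_manager.put_urls({
    'site_id': 1,
    'url': 'http://example.com/',       # hypothetical url
    'depth': 0,
    'remark': {'spider_depth': 3},      # max depth read by __add_url_to_db()
    'retry_times': 0,
})
print(url_manager.get_urls_count())
url_manager.stop()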
from db.data_generator import DataGenerator
from db.redisdb import RedisDB

data_generator = DataGenerator()

# Generating data
data_generator.generate_data()

# Retrieving data
data = data_generator.get_data()

# Create Redis instance
redis = RedisDB()
redis.set(data)

print(redis.get("planes:0"))
print(redis.get("planes:1"))
# It is not necessary to indicate the "planes" prefix as all of them are "planes"
# In order to lock a nested resource use ":"
class WechatService():
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False      # one full round finished
    _is_all_done = False  # today's articles of all accounts have been crawled

    # last time wechat_sogou was not blocked
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # last time wechat_public_platform was not blocked
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        accounts = WechatService._redisdb.sget('wechat:account', count=1)
        for account in accounts:
            account = eval(account)
            WechatService._todo_accounts.append(account)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: check whether the account has newly published articles
        ---------
        @param account_id:
        @param __biz:
        ---------
        @result:
        '''
        result = ''
        if WechatService._wechat_sogou_enable:  # sogou wechat is available
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # new articles published, crawl them
                pass
            elif result == constance.NOT_UPDATE:
                # no new articles
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                # blocked, request failed; record the failure time
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # sogou wechat has been disabled for over 24 hours, worth another try
        elif tools.get_current_timestamp() - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # sogou wechat unavailable, but a day has passed, so try again
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:  # sogou wechat is back
                WechatService._wechat_sogou_enable = True
            elif result == constance.NOT_UPDATE:
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                pass

            # refresh the retry timestamp
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # if sogou wechat is unavailable, fall back to the wechat public platform
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:  # wechat public platform is available
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # new articles published, crawl them
                    pass
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked, request failed; record the failure time
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

            elif tools.get_current_timestamp() - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                # unavailable, but a day has passed, so try again
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # new articles published, crawl them
                    WechatService._wechat_public_platform_enable = True
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked, request failed
                    pass

                # refresh the retry timestamp
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns the biz and whether a full round is done, e.g. (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            return None

        oralce_id, account_id, account_name, last_article_release_time, biz = WechatService._todo_accounts.popleft()

        next_account_id = account_id
        next_account_biz = biz
        next_account_name = account_name

        next_account = next_account_id, next_account_biz

        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            next_account_biz)
        WechatService._db.update(sql)

        return next_account

    def update_account_article_num(self, __biz):
        # query es for the counts
        # today
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                                "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # total number of articles ever published
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''
        log.debug('''
            ----- article info -----
            title        %s
            release time %s
            author       %s
            account      %s
            url          %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            ----- account info -----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
    print(f"{bcolors.OKGREEN}Client 2:{bcolors.ENDC} "
          f"{bcolors.OKCYAN}Updating resource '" + RESOURCE_NAME + f"' {bcolors.ENDC}")
    # Update the resource
    redis.update(name=RESOURCE_NAME, key="client", value="client_2")
    redis.update(name=RESOURCE_NAME, key="random", value=random.random())
else:
    print(
        f"{bcolors.FAIL}Client 2: Error acquiring the lock on resource '"
        + RESOURCE_NAME + f"' {bcolors.ENDC}")


# Create Redis instance
redis = RedisDB()

# Create Redlock instance
dlm = Redlock([
    {
        "host": "localhost",
        "port": 6379,
        "db": 0
    },
])

print(f"{bcolors.OKBLUE}## EXECUTING TEST 6 ##{bcolors.ENDC}")
print(
    f"{bcolors.OKBLUE} Several clients, Several locks, one resource, client blocked {bcolors.ENDC}"
)
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num=None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not fully processed yet, skip fetching; url count = %s' % len(self._urls))
            return

        # report worker status
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)  # idle

        url_count = self._url_count  # default

        # distribute urls dynamically based on the number of idle workers
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min=0, priority_max=0)
        if worker_wait_count:
            # number of pending tasks
            task_count = self._db.zget_count(self._tab_urls)
            # dynamic share = task count / idle worker count
            url_count = task_count // worker_wait_count
            url_count = url_count if url_count <= self._url_count else self._url_count

        urls_list = self._db.zget(self._tab_urls, count=url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # record url count, for testing
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record = url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # report worker status
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)  # busy

            # cache the urls
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no more urls to process
    def is_all_have_done(self):
        # log.debug('checking for pending urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []

        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
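# --- Usage sketch (illustrative, not part of the original source) ---
# Wires up the Collector above as a parser thread would. The module path
# `collector` is an assumption, and a readable config.conf with a [collector]
# section is assumed to exist; the constructor signature, get_urls() and the
# url dict keys come from the snippets in this collection.
from collector import Collector  # hypothetical module path

collector = Collector(tab_urls='news:news_urls', depth=3)
collector.start()

# a parser thread typically drains the in-memory queue in batches
urls = collector.get_urls(count=10)
for url_info in urls:
    print(url_info.get('url'), url_info.get('depth'))

collector.stop()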
class TaskService():
    _task_ring_buff = RingBuff(TASK_BUFFER_SIZE)
    _offset = 1
    _lock = threading.RLock()

    _spider_start_timestamp = 0
    _spider_end_timestamp = 0
    _total_task_size = 0

    _db = OracleDB()
    _redisdb = RedisDB()

    def __init__(self):
        pass

    def load_task(self):
        if TaskService._offset == 1:
            log.info('starting a new crawl round')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # clear the url tables
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')

        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size=TaskService._offset + TASK_BUFFER_SIZE, offset=TaskService._offset)

        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)
        TaskService._total_task_size += len(tasks)

        if not tasks:
            TaskService._spider_end_timestamp = tools.get_current_timestamp()
            log.info('finished one round: processed %s sites, took %s' % (
                TaskService._total_task_size,
                tools.seconds_to_h_m_s(TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))

            TaskService._offset = 1
            self.load_task()

        TaskService._task_ring_buff.put_data(tasks)

    def get_task(self, count=TASK_COUNT):
        TaskService._lock.acquire()  # lock
        tasks = TaskService._task_ring_buff.get_data(count)
        if not tasks:
            self.load_task()
            tasks = TaskService._task_ring_buff.get_data(count)

        TaskService._lock.release()
        return {'tasks': tasks, 'thread_count': THREAD_COUNT}

    def update_task_status(self, tasks, status):
        TaskService._lock.acquire()  # lock
        for task in tasks:
            website_id = task[0]

            sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd :hh24:mi:ss'), t.spider_status = %s where id = %s" % (
                tools.get_current_date(), status, website_id)
            TaskService._db.update(sql)
        TaskService._lock.release()
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not fully processed yet, skip fetching; url count = %s' % len(self._urls))
            return

        urls_list = self._db.zget(self._tab_urls, count=self._url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # cache the urls
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no more urls to process
    def is_all_have_done(self):
        # log.debug('checking for pending urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []

        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of pending urls in redis
        ---------
        ---------
        @result:
        '''
        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary:
        ---------
        @param total_depth: exclusive; depth counted from the customer's point of view
        ---------
        @result:
        '''
        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = 'depth %s url count' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]

        depth_count_info['total url count'] = total_count
        return depth_count_info

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # fetch tasks
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results:
                break

            # assemble json-style urls
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)

                    # table used to count urls per depth
                    self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        # clear the url fingerprint table
        self._redisdb.sdelete('news:news_urls_dupefilter')

        # tables used to count urls per depth
        self._redisdb.sdelete('news:news_urls_dupefilter0')
        self._redisdb.sdelete('news:news_urls_dupefilter1')
        self._redisdb.sdelete('news:news_urls_dupefilter2')
        self._redisdb.sdelete('news:news_urls_dupefilter3')
        self._redisdb.sdelete('news:news_urls_dupefilter4')
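# --- Usage sketch (illustrative, not part of the original source) ---
# Seeds redis from oracle using the news TaskManager above. The module path
# `task_manager` is an assumption; the method names and tables come from the
# snippet above.
from task_manager import TaskManager  # hypothetical module path

task_manager = TaskManager()
if not task_manager.get_task_count():       # nothing pending in redis
    task_manager.clear_task()               # reset the fingerprint tables
    tasks = task_manager.get_task_from_oracle()
    task_manager.add_task_to_redis(tasks)

print(task_manager.get_ever_depth_count(total_depth=5))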
import socket
import os, signal
import time

# The two imports below are assumed: RedisDB and tools are used further down but
# were not imported in the original snippet; the paths are best guesses.
from db.redisdb import RedisDB
import tools
from storage_module.dection_ping_proxy import check_ip
from retrying import retry

# def write(content_info):
#     f = open('D:\start_get_ip\pid.txt', 'a', encoding="utf-8")
#     f.write(str(content_info) + "\n")
#     f.close()
# pid = os.getpid()
# print(pid)
# write(pid)
# while True:
#     print(1)
#     time.sleep(34)

redis_0 = RedisDB()
MAX_POOL = 400
config = os.path.join('D:\proxy\\' + 'config.conf')
redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
bj_ip = socket.gethostbyname(socket.gethostname())


def retry(attempt):  # shadows retrying.retry with a homemade decorator
    def decorator(func):
        def wrapper(*args, **kw):
            att = 0
            while att < attempt:
                try:
                    return func(*args, **kw)
                except Exception:  # completion of the truncated original: retry on failure
                    att += 1
        return wrapper
    return decorator
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()
            self._thread_stop = False
            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
            self._table_article_bloomfilter = table_article + '_bloomfilter'

            self._bloomfilter = BloomFilter(redis_obj=self._db, key=self._table_article_bloomfilter)

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)
            log.debug('articles in cache: %s' % len(self._articles_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)
        if self.get_articles_count() > MAX_ARTICLE_COUNT:  # over the cache limit, flush proactively
            self.__add_article_to_db()

    def get_articles_count(self):
        # count the cached articles (the original measured len() of the table name string)
        return len(self._articles_deque)

    def clear_article(self):
        '''
        @summary: clear the cached articles in redis
        ---------
        ---------
        @result:
        '''
        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            # check whether the article already exists
            if self._bloomfilter.is_contains(article.get('uuid')):
                article_list.append(article)
            else:
                self._bloomfilter.insert(article.get('uuid'))

            if len(article_list) > 100:
                log.debug('adding articles to redis')
                self._db.sadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('adding articles to redis')
            self._db.sadd(self._table_article, article_list)
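# --- Usage sketch (illustrative, not part of the original source) ---
# Pushes one article through the bloom-filtered ArticleManager above. The module
# path `article_manager` and the sample article dict are assumptions; the 'uuid'
# and 'release_time' keys match those used elsewhere in this collection.
from article_manager import ArticleManager  # hypothetical module path

article_manager = ArticleManager(table_article='news:news_article')
article_manager.start()

article_manager.put_articles({
    'uuid': 'example-uuid',                 # hypothetical; used for deduplication
    'title': 'example title',
    'release_time': '2020-01-01 00:00:00',
})
article_manager.stop()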
class TaskManager():
    IS_IN_TIME_RANGE = 1      # inside the crawl time range
    NOT_REACH_TIME_RANGE = 2  # not yet inside the time range
    OVER_MIN_TIME_RANGE = 3   # past the lower bound of the time range

    def __init__(self):
        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))
        self._task_root_key = config.get('spider').get('redis_task_cache_root_key')
        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'
        self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
        self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~')

    def __get_task_from_redis(self, key):
        task = self._redis.zget(key, is_pop=True)
        if task:
            task = eval(task[0])
            return task

    def __random_int(self, min, max):
        pass

    def get_account_task(self):
        """
        Get an account task
        :return: {'__biz': 'Mjc1NjM3MjY2MA==', 'last_publish_time': None} or None
        """
        task = self.__get_task_from_redis(self._account_task_key)
        if not task:
            publish_time_condition = "AND last_publish_time < '{today}'".format(
                today=tools.get_current_date(date_format='%Y-%m-%d' + ' 00:00:00')
            ) if self._ignore_haved_crawl_today_article_account else ''

            sql = '''
                SELECT __biz, last_publish_time
                FROM wechat_account_task
                WHERE `is_zombie` != 1
                  AND (
                        (
                            (UNIX_TIMESTAMP(CURRENT_TIMESTAMP) - UNIX_TIMESTAMP(last_spider_time)) > {monitor_interval}
                            {publish_time_condition}
                        )
                        OR (last_spider_time IS NULL)
                      )
            '''.format(monitor_interval=self._monitor_interval,
                       publish_time_condition=publish_time_condition)

            tasks = self._mysqldb.find(sql, to_json=True)
            if tasks:
                self._redis.zadd(self._account_task_key, tasks)
                task = self.__get_task_from_redis(self._account_task_key)

        return task

    def get_article_task(self):
        """
        Get an article task
        :return: {'article_url': 'http://mp.weixin.qq.com/s?__biz=MzIxNzg1ODQ0MQ==&mid=2247485501&idx=1&sn=92721338ddbf7d907eaf03a70a0715bd&chksm=97f220dba085a9cd2b9a922fb174c767603203d6dbd2a7d3a6dc41b3400a0c477a8d62b96396&scene=27#wechat_redirect'} or None
        """
        task = self.__get_task_from_redis(self._article_task_key)
        if not task:
            sql = 'select id, article_url from wechat_article_task where state = 0 limit 5000'
            tasks = self._mysqldb.find(sql)
            if tasks:
                # mark the tasks as in progress
                task_ids = str(tuple([task[0] for task in tasks])).replace(',)', ')')
                sql = 'update wechat_article_task set state = 2 where id in %s' % (task_ids)
                self._mysqldb.update(sql)
            else:
                sql = 'select id, article_url from wechat_article_task where state = 2 limit 5000'
                tasks = self._mysqldb.find(sql)

            if tasks:
                task_json = [
                    {
                        'article_url': article_url
                    }
                    for id, article_url in tasks
                ]
                self._redis.zadd(self._article_task_key, task_json)
                task = self.__get_task_from_redis(self._article_task_key)

        return task

    def update_article_task_state(self, sn, state=1):
        sql = 'update wechat_article_task set state = %s where sn = "%s"' % (state, sn)
        self._mysqldb.update(sql)

    def record_last_article_publish_time(self, __biz, last_publish_time):
        self._redis.hset(self._last_article_publish_time, __biz, last_publish_time or '')

    def is_reach_last_article_publish_time(self, __biz, publish_time):
        last_publish_time = self._redis.hget(self._last_article_publish_time, __biz)
        if not last_publish_time:
            # check whether mysql has this task
            sql = "select last_publish_time from wechat_account_task where __biz = '%s'" % __biz
            data = self._mysqldb.find(sql)
            if data:  # [(None,)] / []
                last_publish_time = str(data[0][0] or '')
                self.record_last_article_publish_time(__biz, last_publish_time)

        if last_publish_time is None:
            return

        if publish_time < last_publish_time:
            return True

        return False

    def is_in_crawl_time_range(self, publish_time):
        """
        Whether the publish time falls inside the configured crawl time range
        :param publish_time:
        :return: IS_IN_TIME_RANGE / NOT_REACH_TIME_RANGE / OVER_MIN_TIME_RANGE
        """
        if not publish_time or (not self._crawl_time_range[0] and not self._crawl_time_range[1]):
            return TaskManager.IS_IN_TIME_RANGE

        if self._crawl_time_range[0]:  # upper bound of the range
            if publish_time > self._crawl_time_range[0]:
                return TaskManager.NOT_REACH_TIME_RANGE

            if publish_time <= self._crawl_time_range[0] and publish_time >= self._crawl_time_range[1]:
                return TaskManager.IS_IN_TIME_RANGE

        if publish_time < self._crawl_time_range[1]:  # lower bound
            return TaskManager.OVER_MIN_TIME_RANGE

        return TaskManager.IS_IN_TIME_RANGE

    def record_new_last_article_publish_time(self, __biz, new_last_publish_time):
        self._redis.hset(self._new_last_article_publish_time, __biz, new_last_publish_time)

    def get_new_last_article_publish_time(self, __biz):
        return self._redis.hget(self._new_last_article_publish_time, __biz)

    def update_account_last_publish_time(self, __biz, last_publish_time):
        sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}" where __biz="{}"'.format(
            last_publish_time, tools.get_current_date(), __biz
        )
        self._mysqldb.update(sql)

    def is_zombie_account(self, last_publish_timestamp):
        if tools.get_current_timestamp() - last_publish_timestamp > self._zombie_account_not_publish_article_days * 86400:
            return True
        return False

    def sign_account_is_zombie(self, __biz, last_publish_time=None):
        if last_publish_time:
            sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                last_publish_time, tools.get_current_date(), __biz
            )
        else:
            sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                tools.get_current_date(), __biz
            )
        self._mysqldb.update(sql)

    def get_task(self, url=None, tip=''):
        """
        Get the next task.
        :param url: when given, return this url wrapped as the task; otherwise take an account
                    task first, then an article task. When neither exists, sleep for a while
                    before trying again.
        :return:
        """
        sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                __biz = account_task.get('__biz')
                last_publish_time = account_task.get('last_publish_time')
                self.record_last_article_publish_time(__biz, last_publish_time)
                tip = 'crawling article list'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz)
            else:
                article_task = self.get_article_task()
                if article_task:
                    tip = 'crawling article detail'
                    url = article_task.get('article_url')
                else:
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('no task, sleeping {}s'.format(sleep_time))
                    tip = 'no task '

        if url:
            next_page = "{tip} sleeping {sleep_time}s, next refresh at {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ',
                sleep_time=sleep_time,
                begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time),
                url=url,
                sleep_time_msec=sleep_time * 1000
            )
        else:
            next_page = "{tip} sleeping {sleep_time}s, next refresh at {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ',
                sleep_time=sleep_time,
                begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time),
                sleep_time_msec=sleep_time * 1000
            )

        return next_page

    def reset_task(self):
        # clear the redis cache
        keys = self._task_root_key + "*"
        keys = self._redis.getkeys(keys)
        if keys:
            for key in keys:
                self._redis.clear(key)

        # reset in-progress tasks
        sql = "update wechat_article_task set state = 0 where state = 2"
        self._mysqldb.update(sql)
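# --- Usage sketch (illustrative, not part of the original source) ---
# Drives the wechat TaskManager above. The module path `wechat_task_manager`
# is an assumption, and a loaded `config` with the mysqldb/redisdb/spider keys
# is assumed to exist; reset_task() and get_task() come from the snippet above.
from wechat_task_manager import TaskManager  # hypothetical module path

task_manager = TaskManager()
task_manager.reset_task()  # requeue article tasks left in state 2

# get_task() returns an HTML snippet telling the wechat client which url to
# load next and how long to sleep before doing so
next_page = task_manager.get_task()
print(next_page)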
def __init__(self):
    self.redis = RedisDB()
    self.test_url = "https://movie.douban.com/"
class CheckNewArticle():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()

    def get_wait_check_account(self):
        '''
        @summary:
        ---------
        @param :
        ---------
        @result:
        '''
        # take accounts that have been crawled and whose latest release time is more than
        # two hours ago, then re-check whether they have published new articles
        before_tow_hours = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
              from TAB_IOPM_SITE t
             where t.biz is not null
               and mointor_status = 701
               and t.spider_status = 603
               and (t.last_article_release_time is null
                    or t.last_article_release_time <= to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(before_tow_hours)
        accounts = self._oracledb.find(sql)

        # if no crawled accounts are found and redis holds no crawl tasks, the non-603 tasks in
        # the database are probably lost and need to be re-dispatched
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
                  from TAB_IOPM_SITE t
                 where t.biz is not null
                   and mointor_status = 701
                   and t.spider_status != 603
            '''
            accounts = self._oracledb.find(sql)

        return accounts

    def check_new_article(self, account):
        oralce_id, account_id, account_name, last_article_release_time, biz = account

        article_release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(article_release_time)
        if article_release_time:
            last_article_release_time = last_article_release_time or ''
            if article_release_time >= tools.get_current_date('%Y-%m-%d') and article_release_time > last_article_release_time:
                print('{} has new articles waiting to be crawled. release time: {}'.format(account_name, article_release_time))

                sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                           t.last_article_release_time = to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                     where id = {}
                '''.format(article_release_time, oralce_id)

                # multithreaded: each thread needs its own database connection
                oracledb = OracleDB()
                oracledb.update(sql)
                oracledb.close()

                # push into redis as the task pool of the wechat spider
                data = (oralce_id, account_id, account_name, last_article_release_time, biz)
                self._redisdb.sadd('wechat:account', data)
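# --- Usage sketch (illustrative, not part of the original source) ---
# Runs one monitoring pass with the CheckNewArticle class above. The module path
# `check_new_article` is an assumption; the account tuple layout matches the
# unpacking in check_new_article().
from check_new_article import CheckNewArticle  # hypothetical module path

checker = CheckNewArticle()
for account in checker.get_wait_check_account():
    # each account tuple is (id, account_id/domain, name, last_release_time, biz)
    checker.check_new_article(account)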