def distribute_task(self, *args, **kws):
    """
    @summary: Distribute tasks and persist the returned requests
    ---------
    @param tasks:
    ---------
    @result:
    """
    self._is_distributed_task = False

    for parser in self._parsers:
        requests = parser.start_requests(*args, **kws)
        if requests and not isinstance(requests, Iterable):
            raise Exception(
                "%s.%s must return an iterable" % (parser.name, "start_requests")
            )

        result_type = 1
        for request in requests or []:
            if isinstance(request, Request):
                request.parser_name = request.parser_name or parser.name
                self._request_buffer.put_request(request)
                self._is_distributed_task = True
                result_type = 1
            elif isinstance(request, Item):
                self._item_buffer.put_item(request)
                result_type = 2
            elif callable(request):  # a callable "request" may be a function that updates the database
                if result_type == 1:
                    self._request_buffer.put_request(request)
                else:
                    self._item_buffer.put_item(request)

        self._request_buffer.flush()
        self._item_buffer.flush()

    if self._is_distributed_task:  # only announce spider start when there are tasks
        # begin
        self.spider_begin()
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # reset the "no task" notice flag to False
        self._is_show_not_task = False

    elif not self._is_show_not_task:  # no tasks, and the "no task" notice has not been sent yet
        # send the "no task" message
        msg = "《%s》start_requests added no tasks" % (self._spider_name)
        log.info(msg)
        # self.send_msg(msg)

        self._is_show_not_task = True
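# A minimal sketch (not from the source) of a start_requests generator that
# exercises every branch distribute_task handles above: a Request, an Item, and
# a trailing callable. Request/Item are the framework classes used above;
# mark_seed_done is a hypothetical database-update callback.
def start_requests(self, *args, **kws):
    yield Request("https://example.com/list")  # queued via the request buffer
    yield Item()  # switches result_type to 2, so later callables go to the item buffer
    yield lambda: mark_seed_done("https://example.com/list")  # flushed with the item buffer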
def record_batch(self):
    """
    @summary: Record batch info (initialization)
    ---------
    ---------
    @result:
    """
    # query the total task count
    sql = "select count(1) from %s%s" % (
        self._task_table,
        self._task_condition_prefix_where,
    )
    total_task_count = self._mysqldb.find(sql)[0][0]

    batch_date = tools.get_current_date(self._date_format)

    sql = (
        "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
        % (
            self._batch_record_table,
            batch_date,
            0,
            total_task_count,
            self._batch_interval
            if self._batch_interval >= 1
            else self._batch_interval * 24,
            "day" if self._batch_interval >= 1 else "hour",
        )
    )

    affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 means success)
    if affect_count:
        # reset the batch date
        self._batch_date_cache = batch_date
        # refresh os.environ["batch_date"] read by self.batch_date; otherwise the date stays at the previous batch
        os.environ["batch_date"] = self._batch_date_cache

        # spider begins
        self.spider_begin()
        self.record_spider_state(
            spider_type=2,
            state=0,
            batch_date=batch_date,
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )
    else:
        log.error("failed to insert new batch record")

    return affect_count
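# A self-contained sketch of the interval normalization used in the insert
# above: _batch_interval is expressed in days, and sub-day values are stored
# as hours.
def normalize_interval(batch_interval):
    if batch_interval >= 1:
        return batch_interval, "day"
    return batch_interval * 24, "hour"

assert normalize_interval(7) == (7, "day")
assert normalize_interval(0.5) == (12.0, "hour")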
def run(self):
    if not self.is_reach_next_spider_time():
        return

    self._start()

    while True:
        if self.all_thread_is_done():
            if not self._is_notify_end:
                self.spider_end()  # one full round finished
                self.record_spider_state(
                    spider_type=1,
                    state=1,
                    spider_end_time=tools.get_current_date(),
                    batch_interval=self._batch_interval,
                )

                self._is_notify_end = True

            if self._auto_stop_when_spider_done:
                self._stop_all_thread()
                break
        else:
            self._is_notify_end = False

        self.check_task_status()
        tools.delay_time(1)  # check spider status once per second
def __add_task(self):
    # start the parsers' start_requests
    self.spider_begin()  # for spiders that do not auto-stop, this runs only once
    self.record_spider_state(
        spider_type=1,
        state=0,
        batch_date=tools.get_current_date(),
        spider_start_time=tools.get_current_date(),
        batch_interval=self._batch_interval,
    )

    # check whether the task pool still holds tasks; if so, resume crawling them
    todo_task_count = self._collector.get_requests_count()
    if todo_task_count:
        log.info(
            "found %s pending tasks; not issuing new tasks. Resuming from where the last run was interrupted"
            % todo_task_count
        )
    else:
        for parser in self._parsers:
            results = parser.start_requests(*self._parser_args, **self._parser_kwargs)
            # add requests to the request queue; the queue persists them in one place
            if results and not isinstance(results, Iterable):
                raise Exception(
                    "%s.%s must return an iterable" % (parser.name, "start_requests")
                )

            result_type = 1
            for result in results or []:
                if isinstance(result, Request):
                    result.parser_name = result.parser_name or parser.name
                    self._request_buffer.put_request(result)
                    result_type = 1
                elif isinstance(result, Item):
                    self._item_buffer.put_item(result)
                    result_type = 2
                elif callable(result):  # a callable "request" may be a function that updates the database
                    if result_type == 1:
                        self._request_buffer.put_request(result)
                    else:
                        self._item_buffer.put_item(result)
                else:
                    raise TypeError(
                        "start_requests yielded a result of the wrong type; expected Request, Item or a callable, but got: {}".format(
                            type(result)
                        )
                    )

            self._request_buffer.flush()
            self._item_buffer.flush()
def detail_content(self, response):
    request = response.request
    data = request.meta["data"]
    _response = response.response

    s = Selector(response=_response)
    data["content"] = s.xpath(
        './/div[@style="width: 1105px;margin:0 auto"]'
    ).extract_first()
    ctime = s.xpath('.//span[@class="datetime"]/text()').extract_first()
    data["ctime"] = parser.parse(ctime).strftime("%Y-%m-%d %H:%M:%S")
    data["gtime"] = tools.get_current_date()
    if data:
        # self.store_data(data, table_name="test", mysql=True, )
        print("stored to mysql")
        print(data)
        yield data
def detail_content(self, response):
    request = response.request
    data = request.meta["data"]
    _response = response.response

    soup = BeautifulSoup(_response.text, "html.parser")
    data["content"] = soup.select_one(
        'div[style="width: 1105px;margin:0 auto"]'
    ).decode()  # Tag.decode() returns the element's HTML markup as a string
    ctime = soup.select_one("span.datetime").text
    data["ctime"] = parser.parse(ctime).strftime("%Y-%m-%d %H:%M:%S")
    data["gtime"] = tools.get_current_date()
    if data:
        self.store_data(data, table_name="test", mysql=True, oss=False)
        print("stored to mysql")
        print(data)
        yield data
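# The two detail_content versions above extract the same fields with different
# libraries. A self-contained sketch of the equivalence, using a hypothetical
# HTML snippet and parsel's Selector (an assumption: the original's Selector
# import is not shown, but scrapy's Selector wraps parsel and behaves the same):
from bs4 import BeautifulSoup
from parsel import Selector

html = '<span class="datetime">2024-01-01 08:00</span>'
assert Selector(text=html).xpath('.//span[@class="datetime"]/text()').get() == "2024-01-01 08:00"
assert BeautifulSoup(html, "html.parser").select_one("span.datetime").text == "2024-01-01 08:00"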
def parse(self, response: Response):
    request = response.request
    task_obj = request.meta["task"]
    url = task_obj.get("url")
    _response = response.response
    try:
        if _response:
            # todo parsing code
            soup = BeautifulSoup(_response.text, "html.parser")
            rows = soup.select("ul.xinxi_ul li")
            data_list = []
            for item in rows:
                data = {}
                data["url"] = urljoin(_response.url, item.select_one("a").attrs["href"])
                data["title"] = item.select_one("a").text
                data["gtime"] = tools.get_current_date()
                data["ctime"] = parser.parse(
                    item.select_one("span").text
                ).strftime("%Y-%m-%d %H:%M:%S")
                data["batch_date"] = self.local_batch_data
                data_list.append(data)
                print(data)
            if data_list:
                if not self.debug:
                    self.store_data(data_list, table_name=self.task_data_table, oss=False)
                    print("stored to data table")
            if not self.debug:
                # mark the task finished
                self.set_task_state(setting.TASK_FINISH, condition={"url": url})
            logger.debug("task finished {}".format(task_obj))
        else:
            if _response is not None:
                if _response.status_code == 404:
                    url = _response.url
                    # todo parsing code
                    # mark the task finished (dead link)
                    self.set_task_state(state=-1, condition={"url": url})
                    return
            raise Exception("request failed, re-queuing the task")
    except Exception as e:
        logger.exception(e)
        self.put_task(task_obj)
        return
def parse(self, response: Response):
    request = response.request
    task_obj = request.meta["task"]
    url = task_obj.get("url")
    _response = response.response
    try:
        if _response:
            # todo parsing code
            soup = BeautifulSoup(_response.text, "html.parser")
            data = {}
            data["url"] = _response.url
            data["title"] = soup.select_one(
                'span[style="font-size: 20px;font-weight: bold"]'
            ).text
            data["ctime"] = parser.parse(
                soup.select_one("span.datetime").text
            ).strftime("%Y-%m-%d %H:%M:%S")
            data["gtime"] = tools.get_current_date()
            data["content"] = soup.select_one(
                'div[style="width: 1105px;margin:0 auto"]'
            ).decode()
            data["batch_date"] = self.local_batch_data
            print(data)
            if not self.debug:
                self.store_data([data], table_name=self.task_data_table, oss=False)
                print("stored to data table")
            if not self.debug:
                # mark the task finished
                self.set_task_state(setting.TASK_FINISH, condition={"url": url})
            logger.debug("task finished {}".format(task_obj))
        else:
            if _response is not None:
                if _response.status_code == 404:
                    url = _response.url
                    # todo parsing code
                    # mark the task finished (dead link)
                    self.set_task_state(state=-1, condition={"url": url})
                    return
            raise Exception("request failed, re-queuing the task")
    except Exception as e:
        logger.exception(e)
        self.put_task(task_obj)
        return
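# Both parse() methods above share one task-state decision. A sketch that
# distills it (hypothetical helper; assumes a requests-style response object
# whose truthiness means a successful status, as in the code above):
def classify_response(resp):
    if resp is None:
        return "retry"   # no response at all -> the except block re-queues via put_task
    if resp.status_code == 404:
        return "dead"    # set_task_state(state=-1): dead link, never retried
    if resp:
        return "store"   # parse, store_data, then set_task_state(TASK_FINISH)
    return "retry"       # other failures raise and fall into the re-queue path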
def run(self):
    """
    @summary: Overridden run method: check whether the mysql tasks are all done, and stop when they are
    ---------
    ---------
    @result:
    """
    try:
        self.create_batch_record_table()

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        while True:
            if (
                self.task_is_done() and self.all_thread_is_done()
            ):  # all redis tasks are done and all mysql tasks are done (checking all_thread_is_done per thread prevents updating the task state, and hence ending the program, before the work is actually finished)
                if not self._is_notify_end:
                    self.spider_end()
                    self.record_spider_state(
                        spider_type=2,
                        state=1,
                        batch_date=self._batch_date_cache,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    self._is_notify_end = True

                if self._auto_stop_when_spider_done:
                    self._stop_all_thread()
                    break
            else:
                self._is_notify_end = False

            self.check_task_status()
            tools.delay_time(10)  # check spider status every 10 seconds

    except Exception as e:
        msg = "《%s》main thread exception, spider stopped; exception: %s" % (
            self._batch_name,
            e,
        )
        log.error(msg)
        self.send_msg(msg)

        os._exit(137)  # exit code 137 (wait status 35072) so the spider manager can restart the process
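# Why the comment above mentions 35072: a child exiting with code 137 is
# reported to a parent reading the raw os.wait() status as 137 << 8.
assert 137 << 8 == 35072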
def _start(self):
    if self._auto_start_requests:
        # lock the task-adding section to prevent multiple processes from adding duplicate tasks
        with RedisLock(
            key=self._spider_name,
            timeout=3600,
            wait_timeout=60,
            redis_uri="redis://:{password}@{host_post}/{db}".format(
                password=setting.REDISDB_USER_PASS,
                host_post=setting.REDISDB_IP_PORTS,
                db=setting.REDISDB_DB,
            ),
        ) as lock:
            if lock.locked:
                # start the parsers' start_requests
                self.spider_begin()  # for spiders that do not auto-stop, this runs only once
                self.record_spider_state(
                    spider_type=1,
                    state=0,
                    batch_date=tools.get_current_date(),
                    spider_start_time=tools.get_current_date(),
                    batch_interval=self._batch_interval,
                )

                # check whether the task pool still holds tasks; if so, resume crawling them
                todo_task_count = self._collector.get_requests_count()
                if todo_task_count:
                    log.info(
                        "found %s pending tasks; not issuing new tasks. Resuming from where the last run was interrupted"
                        % todo_task_count
                    )
                else:
                    for parser in self._parsers:
                        results = parser.start_requests(
                            *self._parser_args, **self._parser_kwargs
                        )
                        # add requests to the request queue; the queue persists them in one place
                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s must return an iterable"
                                % (parser.name, "start_requests")
                            )

                        result_type = 1
                        for result in results or []:
                            if isinstance(result, Request):
                                result.parser_name = (
                                    result.parser_name or parser.name
                                )
                                self._request_buffer.put_request(result)
                                result_type = 1
                            elif isinstance(result, Item):
                                self._item_buffer.put_item(result)
                                result_type = 2
                            elif callable(result):  # a callable "request" may be a function that updates the database
                                if result_type == 1:
                                    self._request_buffer.put_request(result)
                                else:
                                    self._item_buffer.put_item(result)
                            else:
                                raise TypeError(
                                    "start_requests yielded a result of the wrong type; expected Request, Item or a callable, but got: {}".format(
                                        type(result)
                                    )
                                )

                        self._request_buffer.flush()
                        self._item_buffer.flush()

    # start the collector
    self._collector.start()

    # start the parser controls
    for i in range(self._parser_count):
        parser_control = self._parser_control_obj(
            self._collector,
            self._table_folder,
            self._request_buffer,
            self._item_buffer,
        )

        for parser in self._parsers:
            parser_control.add_parser(parser)

        parser_control.start()
        self._parser_controls.append(parser_control)

    # start the request_buffer
    self._request_buffer.start()

    # start the item_buffer
    self._item_buffer.start()
def deal_file_info(file):
    file = file.replace("{DATE}", tools.get_current_date())
    file = file.replace("{USER}", os.getenv("USER"))  # note: USER is unset on Windows, so this raises TypeError there
    return file
def deal_file_info(file):
    file = file.replace("{DATE}", tools.get_current_date())
    file = file.replace("{USER}", getpass.getuser())  # cross-platform, unlike os.getenv("USER")
    return file
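# Hypothetical usage of deal_file_info: expanding log-path placeholders.
# The exact {DATE} text depends on tools.get_current_date()'s default format.
log_path = deal_file_info("/logs/{USER}/{DATE}.log")
print(log_path)  # e.g. /logs/alice/2024-01-01 10:00:00.log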