def distribute_task(self):
    """
    @summary: Distribute tasks: run every parser's start_requests and push
        the yielded requests/items into the buffers for batched DB writes.
    ---------
    ---------
    @result: None. Side effects: fills request/item buffers, flushes them,
        and records spider start state when at least one task was distributed.
    """
    # Tracks whether any Request was produced this round; drives the
    # "spider begin" vs "no task" notification below.
    self._is_distributed_task = False

    for parser in self._parsers:
        # NOTE(review): "__start_requests" undergoes private name mangling to
        # "_<EnclosingClass>__start_requests" on the parser object — confirm
        # the parser actually exposes that attribute (sibling code at
        # __add_task calls plain "start_requests()").
        requests = parser.__start_requests()
        if requests and not isinstance(requests, Iterable):
            raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

        # result_type remembers whether the last yielded object was a
        # Request (1) or an Item (2); a subsequent callable is routed to the
        # matching buffer so it executes after that object is persisted.
        result_type = 1
        for request in requests or []:
            if isinstance(request, Request):
                request.parser_name = request.parser_name or parser.name
                self._request_buffer.put_request(request)
                self._is_distributed_task = True
                result_type = 1
            elif isinstance(request, Item):
                self._item_buffer.put_item(request)
                result_type = 2
            elif callable(request):
                # A callable "request" may be e.g. a DB-update function;
                # NOTE(review): unlike __add_task, any other type is silently
                # ignored here instead of raising TypeError — confirm intended.
                if result_type == 1:
                    self._request_buffer.put_request(request)
                else:
                    self._item_buffer.put_item(request)

    # Persist whatever is still buffered before reporting state.
    self._request_buffer.flush()
    self._item_buffer.flush()

    if self._is_distributed_task:
        # Only announce spider start when there actually are tasks.
        # begin
        self.spider_begin()
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # Reset the "already warned about no task" flag.
        self._is_show_not_task = False

    elif not self._is_show_not_task:
        # No task and the no-task message has not been pushed yet.
        msg = "《%s》start_requests无任务添加" % (self._spider_name)
        log.info(msg)
        # self.send_msg(msg)

        self._is_show_not_task = True
def record_batch(self): """ @summary: 记录批次信息(初始化) --------- --------- @result: """ # 查询总任务数 sql = "select count(1) from %s%s" % ( self._task_table, self._task_condition_prefix_where, ) total_task_count = self._mysqldb.find(sql)[0][0] batch_date = tools.get_current_date(self._date_format) sql = ( "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)" % ( self._batch_record_table, batch_date, 0, total_task_count, self._batch_interval if self._batch_interval >= 1 else self._batch_interval * 24, "day" if self._batch_interval >= 1 else "hour", ) ) affect_count = self._mysqldb.add(sql) # None / 0 / 1 (1 为成功) if affect_count: # 重置批次日期 self._batch_date_cache = batch_date # 重新刷下self.batch_date 中的 os.environ.get('batch_date') 否则日期还停留在上一个批次 os.environ["batch_date"] = self._batch_date_cache # 爬虫开始 self.spider_begin() self.record_spider_state( spider_type=2, state=0, batch_date=batch_date, spider_start_time=tools.get_current_date(), batch_interval=self._batch_interval, ) else: log.error("插入新批次失败") return affect_count
def run(self):
    """Main supervision loop.

    Skips the run entirely when the next scheduled spider time has not been
    reached. Otherwise starts all worker threads and polls once per second:
    when every thread is idle it announces end-of-round exactly once and,
    if auto-stop is enabled, shuts the threads down and returns.
    """
    if not self.is_reach_next_spider_time():
        return

    self._start()

    while True:
        all_done = self.all_thread_is_done()
        if not all_done:
            # Work resumed — allow the end notification to fire again later.
            self._is_notify_end = False
        else:
            if not self._is_notify_end:
                # One full round finished; notify exactly once.
                self.spider_end()
                self.record_spider_state(
                    spider_type=1,
                    state=1,
                    spider_end_time=tools.get_current_date(),
                    batch_interval=self._batch_interval,
                )
                self._is_notify_end = True

            if self._auto_stop_when_spider_done:
                self._stop_all_thread()
                return

        self.check_task_status()
        tools.delay_time(1)  # poll spider state once per second
def run(self):
    """Main supervision loop with heartbeat.

    Returns immediately when the next scheduled spider time has not been
    reached. Otherwise starts the workers and polls once per second:
    each tick emits a heartbeat, announces end-of-round exactly once when
    all threads are idle, and — unless keep-alive mode is on — stops the
    threads and exits. Any per-tick exception is logged and swallowed so
    the loop keeps running.
    """
    if not self.is_reach_next_spider_time():
        return

    self._start()

    while True:
        try:
            self.heartbeat()

            round_finished = self.all_thread_is_done()
            if not round_finished:
                # Work resumed — re-arm the end-of-round notification.
                self._is_notify_end = False
            else:
                if not self._is_notify_end:
                    # One full round finished; notify exactly once.
                    self.spider_end()
                    self.record_spider_state(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )
                    self._is_notify_end = True

                if not self._keep_alive:
                    self._stop_all_thread()
                    break

            self.check_task_status()
        except Exception as e:
            log.exception(e)

        tools.delay_time(1)  # poll spider state once per second
def __add_task(self):
    """
    @summary: Seed the task queue: announce spider start, then either resume
        leftover tasks from the pool or run every parser's start_requests
        and push the results into the request/item buffers.
    ---------
    ---------
    @result: None. Side effects: records spider state, fills and flushes
        the request/item buffers.
    @raise: Exception when a parser's start_requests returns a non-iterable;
        TypeError when a yielded result is not Request/Item/callable.
    """
    # Start the parsers' start_requests; for spiders that never auto-stop
    # this can only run once.
    self.spider_begin()
    self.record_spider_state(
        spider_type=1,
        state=0,
        batch_date=tools.get_current_date(),
        spider_start_time=tools.get_current_date(),
        batch_interval=self._batch_interval,
    )

    # If the task pool still holds tasks, resume them instead of re-seeding.
    todo_task_count = self._collector.get_requests_count()
    if todo_task_count:
        log.info("检查到有待做任务 %s 条,不重下发新任务。将接着上回异常终止处继续抓取" % todo_task_count)
    else:
        for parser in self._parsers:
            results = parser.start_requests()
            # Requests are added through the request buffer, which persists
            # them to the DB in batches.
            if results and not isinstance(results, Iterable):
                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

            # result_type remembers whether the last yielded object was a
            # Request (1) or an Item (2); a following callable is routed to
            # the matching buffer so it executes after that object persists.
            result_type = 1
            for result in results or []:
                if isinstance(result, Request):
                    result.parser_name = result.parser_name or parser.name
                    self._request_buffer.put_request(result)
                    result_type = 1

                elif isinstance(result, Item):
                    self._item_buffer.put_item(result)
                    result_type = 2

                elif callable(result):
                    # A callable "request" may be e.g. a DB-update function.
                    if result_type == 1:
                        self._request_buffer.put_request(result)
                    else:
                        self._item_buffer.put_item(result)
                else:
                    # BUG FIX: error message said "bug get type"; corrected
                    # to "but get type".
                    raise TypeError(
                        "start_requests yield result type error, expect Request、Item、callback func, but get type: {}".format(
                            type(result)
                        )
                    )

        # Persist whatever is still buffered.
        self._request_buffer.flush()
        self._item_buffer.flush()
def parse(self, request, response):
    """Extract the JD product title and emit one item plus a task-state update."""
    product_title = (
        response.xpath("string(//div[@class='sku-name'])")
        .extract_first(default="")
        .strip()
    )

    result = Item()
    result.table_name = "jd_item"  # destination table
    result.title = product_title
    result.batch_date = self.batch_date  # batch info maintained by the framework
    result.crawl_time = tools.get_current_date()  # current timestamp
    yield result  # auto batched insert

    # Mark this task as done (state = 1).
    yield self.update_task_batch(request.task_id, 1)
def parse_seats(self, request, response):
    """Parse seat-occupancy details for a single screening.

    Yields one detail item plus a snapshot task that re-collects the page
    shortly before showtime. Returns early when ticketing is not open yet.
    """
    # Ticketing not open for this screening — nothing to parse.
    if re.search("本场次暂未开放售票", response.text):
        return

    movie_id = request.movie_id
    movie_name = response.xpath(
        '//p[@class="name text-ellipsis"]/text()'
    ).extract_first()
    cinema_id = tools.get_param(request.url, "movieId")

    def _detail(pattern):
        # Every labelled field shares the same "<label> :</span>...'>value<" markup.
        return response.re_first(pattern, default="").strip()

    cinema_name = _detail("影院 :</span>.*?'>(.*?)<")
    screen = _detail("影厅 :</span>.*?'>(.*?)<")
    version = _detail("版本 :</span>.*?'>(.*?)<")
    show_time = _detail("场次 :</span>.*?'>(.*?)<")
    price = _detail("票价 :</span>.*?'>(.*?)<")

    # Drop the leading weekday token, then normalize the timestamp.
    show_time = " ".join(show_time.split(" ")[1:])
    show_time = tools.format_time(show_time)

    seat_selectable_count = len(
        response.xpath('//span[@class="seat selectable"]')
    )
    seat_sold_count = len(response.xpath('//span[@class="seat sold"]'))
    seat_total_count = seat_selectable_count + seat_sold_count

    # Persist the screening detail.
    detail = maoyan_film_detail_item.MaoyanFilmDetailItem()
    detail.movie_id = movie_id
    detail.movie_name = movie_name
    detail.city_id = request.city_id
    detail.cinema_id = cinema_id
    detail.cinema_name = cinema_name
    detail.screen = screen
    detail.version = version
    detail.price = price
    detail.show_time = show_time
    detail.seat_sold_count = seat_sold_count
    detail.seat_total_count = seat_total_count
    detail.url = response.url
    detail.crawl_time = tools.get_current_date()
    yield detail

    # Produce a task to re-collect this detail page 5 minutes before showtime.
    snapshot = (
        maoyan_film_detail_snapshot_task_item.MaoyanFilmDetailSnapshotTaskItem()
    )
    snapshot.movie_id = movie_id  # movie id
    snapshot.city_id = request.city_id  # city id
    snapshot.url = response.url  # ticketing url
    snapshot.show_time = show_time  # showtime
    snapshot.crawl_time = None  # crawl time (filled later)
    yield snapshot
def run(self):
    """
    @summary: Overridden run: supervise the batch spider until the mysql
        tasks are done, then stop. Polls every 10 seconds; a fatal error in
        the outer loop notifies and hard-exits so the crawler manager can
        restart the process.
    ---------
    ---------
    @result: None (may terminate the process via os._exit on fatal error)
    """
    try:
        self.create_batch_record_table()

        if not self._parsers:  # not add_parser mode — the spider is its own parser
            self._parsers.append(self)

        self._start()

        while True:
            try:
                self.heartbeat()
                if (
                    self.task_is_done() and self.all_thread_is_done()
                ):  # all redis tasks done AND mysql tasks done (checking every
                    # thread via all_thread_is_done prevents marking tasks done
                    # — and ending the program — while work is still in flight)
                    if not self._is_notify_end:
                        self.spider_end()
                        self.record_spider_state(
                            spider_type=2,
                            state=1,
                            batch_date=self._batch_date_cache,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                else:
                    self._is_notify_end = False

                self.check_task_status()

            except Exception as e:
                log.exception(e)

            tools.delay_time(10)  # check spider state every 10 seconds

    except Exception as e:
        msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
        log.error(msg)
        # BUG FIX: the original called str.format() on a %-style template
        # ("《%s》...".format(...)), which is a no-op and left the literal
        # "%s" in the message; use %-formatting so the batch name appears.
        self.send_msg(
            msg,
            level="error",
            message_prefix="《%s》爬虫异常结束" % self._batch_name,
        )

        os._exit(137)  # exit code 137 (wait status 35072) so the crawler manager restarts it
def parse(self, request, response):
    """Parse the cinema-list page.

    For each cinema: yield a list item and a follow-up request for its
    show times. Then schedule the remaining pages and, on page 1 only,
    yield a task-state update.
    """
    for cinema in response.xpath('//div[@class="cinema-cell"]'):
        # Cinema info.
        name = cinema.xpath(
            './/a[@class="cinema-name"]/text()'
        ).extract_first()
        address = cinema.xpath(
            './/p[@class="cinema-address"]/text()'
        ).extract_first()
        cinema_id = cinema.xpath(
            './/a[@class="cinema-name"]/@data-val'
        ).re_first("cinema_id: (\d*)")
        tags = cinema.xpath(
            ".//div[@class='cinema-tags']//span/text()"
        ).extract()

        # Persist the cinema list entry.
        list_item = maoyan_cinema_list_item.MaoyanCinemaListItem()
        list_item.name = name
        list_item.address = address
        list_item.cinema_id = cinema_id
        list_item.tags = tags
        list_item.crawl_time = tools.get_current_date()
        list_item.city_id = request.city_id
        yield list_item

        # Follow the cinema page to collect show times.
        detail_url = cinema.xpath(
            './/a[@class="cinema-name"]/@href'
        ).extract_first()
        showtime_request = request.copy()
        showtime_request.url = detail_url
        showtime_request.cinema_id = cinema_id
        showtime_request.callback = self.parse_play_time
        yield showtime_request

    # Pagination: skip the first and last pager links (prev/next arrows).
    page = request.page
    pager_links = response.xpath(
        '//ul[@class="list-pager"]//a/@href'
    ).extract()
    for page_url in pager_links[1:-1]:
        page += 1
        yield feapder.Request(
            page_url,
            city_id=request.city_id,
            movie_id=request.movie_id,
            show_date=request.show_date,
            brand_id=request.brand_id,
            page=page,
            task_id=request.task_id,
        )

    if request.page == 1:
        # Only the first page updates the task state.
        yield self.update_task_batch(request.task_id, 1)
def deal_file_info(file):
    """Substitute the {DATE} and {USER} placeholders in a template string."""
    substitutions = {
        "{DATE}": tools.get_current_date(),
        "{USER}": getpass.getuser(),
    }
    for placeholder, value in substitutions.items():
        file = file.replace(placeholder, value)
    return file
def deal_file_info(file):
    """Substitute the {DATE} and {USER} placeholders in a template string.

    The user name comes from the FEAPDER_USER environment variable when set,
    otherwise from the OS account name.
    """
    user = os.getenv("FEAPDER_USER") or getpass.getuser()
    substitutions = (
        ("{DATE}", tools.get_current_date()),
        ("{USER}", user),
    )
    for placeholder, value in substitutions:
        file = file.replace(placeholder, value)
    return file
def parse(self, request, response):
    """Parse the hot-movie list page.

    For each movie: yield a list item plus one detail task per configured
    brand. On page 1 only, schedule the remaining pages and yield a
    task-state update.
    """
    for movie in response.xpath("//dl[@class='movie-list']//dd"):
        cover = movie.xpath(
            './div[@class="movie-item film-channel"]//div[@class="movie-poster"]/img[2]/@data-src'
        ).extract_first()
        name = movie.xpath(
            './div[@class="channel-detail movie-item-title"]/@title'
        ).extract_first()
        url = movie.xpath(
            './div[@class="channel-detail movie-item-title"]/a/@href'
        ).extract_first()
        score = movie.xpath(
            'string(./div[@class="channel-detail channel-detail-orange"])'
        ).extract_first()
        movie_type = movie.re_first("类型:</span>(.*?)<", default="").strip()
        main_actor = movie.re_first("主演:</span>(.*?)<", default="").strip()
        release_date = movie.re_first("上映时间:</span>(.*?)<", default="").strip()
        imax = movie.xpath(
            './div[@class="movie-item film-channel"]//div[@class="movie-ver"]/i/@class'
        ).extract_first()
        movie_id = url.split("/")[-1]  # id is the last path segment of the url

        # Persist the list entry.
        hot_movie = maoyan_hot_movie_list_item.MaoyanHotMovieListItem()
        hot_movie.name = name
        hot_movie.movie_id = movie_id
        hot_movie.cover = cover
        hot_movie.url = url
        hot_movie.score = score
        hot_movie.movie_type = movie_type
        hot_movie.main_actor = main_actor
        hot_movie.release_date = release_date
        hot_movie.imax = imax
        hot_movie.city_id = request.city_id
        hot_movie.crawl_time = tools.get_current_date()
        yield hot_movie

        # Fan out one detail task per brand (-1 means all brands).
        for brand_id in setting.BRAND_IDS:
            detail_task = (
                maoyan_film_detail_task_item.MaoyanFilmDetailTaskItem()
            )
            detail_task.movie_id = movie_id  # movie id
            detail_task.city_id = request.city_id  # city id
            detail_task.brand_id = brand_id  # brand, -1 means all
            detail_task.show_date = tools.get_current_date("%Y-%m-%d")  # date
            yield detail_task

    # Pagination: only the first page schedules the rest.
    if request.page == 1:
        total_page = response.xpath(
            '//ul[@class="list-pager"]//li[last()-1]/a/text()'
        ).extract_first()
        if total_page:
            for page in range(2, int(total_page) + 1):
                yield feapder.Request(
                    page=page,
                    city_id=request.city_id,
                    task_id=request.task_id,
                )

        # Update the task state.
        yield self.update_task_batch(request.task_id, 1)