Example #1
    def distribute_task(self):
        """
        @summary: 分发任务 并将返回的request入库
        ---------
        ---------
        @result:
        """
        self._is_distributed_task = False

        for parser in self._parsers:
            requests = parser.start_requests()
            if requests and not isinstance(requests, Iterable):
                raise Exception("the return value of %s.%s must be iterable" %
                                (parser.name, "start_requests"))

            result_type = 1
            for request in requests or []:
                if isinstance(request, Request):
                    request.parser_name = request.parser_name or parser.name
                    self._request_buffer.put_request(request)

                    self._is_distributed_task = True
                    result_type = 1

                elif isinstance(request, Item):
                    self._item_buffer.put_item(request)
                    result_type = 2

                elif callable(request):  # a callable request may be a function that e.g. updates the database
                    if result_type == 1:
                        self._request_buffer.put_request(request)
                    else:
                        self._item_buffer.put_item(request)

            self._request_buffer.flush()
            self._item_buffer.flush()

        if self._is_distributed_task:  # announce spider start only when tasks were added
            # begin
            self.spider_begin()
            self.record_spider_state(
                spider_type=1,
                state=0,
                batch_date=tools.get_current_date(),
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )

            # reset the "no task" notice flag back to False
            self._is_show_not_task = False

        elif not self._is_show_not_task:  # no tasks, and the "no task" notice has not been sent yet
            # send the "no task" message
            msg = "《%s》start_requests added no tasks" % (self._spider_name)
            log.info(msg)

            # self.send_msg(msg)

            self._is_show_not_task = True
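
The dispatch above accepts three yield types from start_requests: a Request, an Item, or a callable. A minimal sketch of a parser exercising all three, assuming feapder's public BaseParser/Request/Item API (the URL and table name are illustrative):

import feapder
from feapder import Item


class DemoParser(feapder.BaseParser):
    def start_requests(self):
        # a Request is queued through the request buffer
        yield feapder.Request("https://example.com/list")

        # an Item goes to the item buffer
        item = Item()
        item.table_name = "demo_table"  # hypothetical table
        item.seed = "init"
        yield item

        # a callable lands in whichever buffer the last yield matched
        # (result_type above) and runs when that buffer flushes
        yield lambda: print("buffer flushed")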
Example #2
    def record_batch(self):
        """
        @summary: 记录批次信息(初始化)
        ---------
        ---------
        @result:
        """

        # query the total task count
        sql = "select count(1) from %s%s" % (
            self._task_table,
            self._task_condition_prefix_where,
        )
        total_task_count = self._mysqldb.find(sql)[0][0]

        batch_date = tools.get_current_date(self._date_format)

        sql = (
            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
            % (
                self._batch_record_table,
                batch_date,
                0,
                total_task_count,
                self._batch_interval
                if self._batch_interval >= 1
                else self._batch_interval * 24,
                "day" if self._batch_interval >= 1 else "hour",
            )
        )

        affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 means success)
        if affect_count:
            # reset the batch date
            self._batch_date_cache = batch_date
            # refresh the os.environ.get('batch_date') read by self.batch_date, otherwise the date stays on the previous batch
            os.environ["batch_date"] = self._batch_date_cache

            # spider start
            self.spider_begin()
            self.record_spider_state(
                spider_type=2,
                state=0,
                batch_date=batch_date,
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )
        else:
            log.error("插入新批次失败")

        return affect_count
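
record_batch builds its insert by %-interpolation, which works for trusted config values but invites quoting bugs and injection. A sketch of the same statement with driver-side placeholders, assuming a pymysql-style connection (credentials and table name are illustrative):

import pymysql

conn = pymysql.connect(host="localhost", user="root",
                       password="", database="spider")  # illustrative credentials
sql = (
    "insert into batch_record "
    "(batch_date, done_count, total_count, `interval`, interval_unit, create_time) "
    "values (%s, %s, %s, %s, %s, CURRENT_TIME)"
)
with conn.cursor() as cursor:
    # the driver escapes each value; only the table name still has to be trusted
    cursor.execute(sql, ("2024-01-01", 0, 100, 1, "day"))
conn.commit()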
Example #3
    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            if self.all_thread_is_done():
                if not self._is_notify_end:
                    self.spider_end()  # finished one round
                    self.record_spider_state(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    self._is_notify_end = True

                if self._auto_stop_when_spider_done:
                    self._stop_all_thread()
                    break

            else:
                self._is_notify_end = False

            self.check_task_status()

            tools.delay_time(1)  # check the spider status once per second
Example #4
    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            try:
                self.heartbeat()
                if self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()  # finished one round
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                else:
                    self._is_notify_end = False

                self.check_task_status()

            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check the spider status once per second
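
Compared with Example #3, this loop keys its exit on _keep_alive and wraps each iteration in try/except, so a transient heartbeat or status-check failure cannot kill the supervisor loop. A usage sketch, assuming a feapder.Spider subclass and the keep_alive keyword of recent feapder releases (redis_key and URL are illustrative):

import feapder


class AliveSpider(feapder.Spider):
    def start_requests(self):
        yield feapder.Request("https://example.com")

    def parse(self, request, response):
        print(response.url)


# keep_alive=True keeps run() polling for new tasks instead of
# stopping all threads after one round
AliveSpider(redis_key="demo:alive_spider", keep_alive=True).start()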
Example #5
    def __add_task(self):
        # run each parser's start_requests
        self.spider_begin()  # for spiders that do not end automatically, this can only run once
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # check whether the task pool still holds tasks; if so, continue crawling them
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info("检查到有待做任务 %s 条,不重下发新任务。将接着上回异常终止处继续抓取" % todo_task_count)
        else:
            for parser in self._parsers:
                results = parser.start_requests()
                # add requests to the request queue; the queue stores them in batches
                if results and not isinstance(results, Iterable):
                    raise Exception("the return value of %s.%s must be iterable" % (parser.name, "start_requests"))

                result_type = 1
                for result in results or []:
                    if isinstance(result, Request):
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1

                    elif isinstance(result, Item):
                        self._item_buffer.put_item(result)
                        result_type = 2

                    elif callable(result):  # a callable result may be a function that e.g. updates the database
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
                            self._item_buffer.put_item(result)
                    else:
                        raise TypeError(
                            "start_requests yielded a result of the wrong type; expected Request, Item, or a callback function, but got: {}".format(
                                type(result)
                            )
                        )

                self._request_buffer.flush()
                self._item_buffer.flush()
Example #6
    def parse(self, request, response):
        title = response.xpath("string(//div[@class='sku-name'])").extract_first(default="").strip()

        item = Item()
        item.table_name = "jd_item"  # 指定入库的表名
        item.title = title
        item.batch_date = self.batch_date  # batch info, maintained by the framework itself
        item.crawl_time = tools.get_current_date()  # current time
        yield item  # stored to the database automatically, in batches
        yield self.update_task_batch(request.task_id, 1)  # update the task state
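
Instead of a bare Item with table_name assigned at runtime, the same record can be modeled as a dedicated Item subclass; a sketch (JdItem and its field defaults mirror the code above but are otherwise illustrative):

from feapder import Item


class JdItem(Item):
    def __init__(self):
        super().__init__()
        self.table_name = "jd_item"  # same target table as above
        self.title = None
        self.batch_date = None
        self.crawl_time = None
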
    def parse_seats(self, request, response):
        """
        解析客座率等信息
        """
        if re.search("本场次暂未开放售票", response.text):
            return

        movie_id = request.movie_id
        movie_name = response.xpath(
            '//p[@class="name text-ellipsis"]/text()').extract_first()
        cinema_id = tools.get_param(request.url, "movieId")
        cinema_name = response.re_first("影院 :</span>.*?'>(.*?)<",
                                        default="").strip()
        screen = response.re_first("影厅 :</span>.*?'>(.*?)<",
                                   default="").strip()
        version = response.re_first("版本 :</span>.*?'>(.*?)<",
                                    default="").strip()
        show_time = response.re_first("场次 :</span>.*?'>(.*?)<",
                                      default="").strip()
        price = response.re_first("票价 :</span>.*?'>(.*?)<", default="").strip()

        show_time = " ".join(show_time.split(" ")[1:])
        show_time = tools.format_time(show_time)

        seat_selectable_count = len(
            response.xpath('//span[@class="seat selectable"]'))
        seat_sold_count = len(response.xpath('//span[@class="seat sold"]'))
        seat_total_count = seat_selectable_count + seat_sold_count

        # store the item
        item = maoyan_film_detail_item.MaoyanFilmDetailItem()
        item.movie_id = movie_id
        item.movie_name = movie_name
        item.city_id = request.city_id
        item.cinema_id = cinema_id
        item.cinema_name = cinema_name
        item.screen = screen
        item.version = version
        item.price = price
        item.show_time = show_time
        item.seat_sold_count = seat_sold_count
        item.seat_total_count = seat_total_count
        item.url = response.url
        item.crawl_time = tools.get_current_date()
        yield item

        # generate a detail task that crawls once more 5 minutes before the show starts
        snapshot_task_item = (maoyan_film_detail_snapshot_task_item.
                              MaoyanFilmDetailSnapshotTaskItem())
        snapshot_task_item.movie_id = movie_id  # movie id
        snapshot_task_item.city_id = request.city_id  # city id
        snapshot_task_item.url = response.url  # ticketing URL
        snapshot_task_item.show_time = show_time  # show time
        snapshot_task_item.crawl_time = None  # crawl time
        yield snapshot_task_item
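
parse_seats stores the raw sold and total seat counts and leaves the occupancy rate to downstream consumers. A helper for computing it at parse time (a sketch; occupancy_rate and the occupancy field are illustrative, with a guard for shows that expose no seat map):

def occupancy_rate(seat_sold_count: int, seat_total_count: int):
    """Sold seats / total seats, or None when no seats are listed."""
    return round(seat_sold_count / seat_total_count, 4) if seat_total_count else None

# usage inside parse_seats, before yielding the item:
# item.occupancy = occupancy_rate(seat_sold_count, seat_total_count)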
Example #8
    def run(self):
        """
        @summary: 重写run方法 检查mysql中的任务是否做完, 做完停止
        ---------
        ---------
        @result:
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # not in add_parser mode
                self._parsers.append(self)

            self._start()

            while True:
                try:
                    self.heartbeat()
                    if (
                            self.task_is_done() and self.all_thread_is_done()
                    ):  # all redis tasks and all mysql tasks are done (all_thread_is_done is checked per thread to avoid updating task states before the work finishes, which would end the program early)
                        if not self._is_notify_end:
                            self.spider_end()
                            self.record_spider_state(
                                spider_type=2,
                                state=1,
                                batch_date=self._batch_date_cache,
                                spider_end_time=tools.get_current_date(),
                                batch_interval=self._batch_interval,
                            )

                            self._is_notify_end = True

                        if not self._keep_alive:
                            self._stop_all_thread()
                            break
                    else:
                        self._is_notify_end = False

                    self.check_task_status()

                except Exception as e:
                    log.exception(e)

                tools.delay_time(10)  # check the spider status every 10 seconds

        except Exception as e:
            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
            log.error(msg)
            self.send_msg(msg,
                          level="error",
                          message_prefix="《%s》爬虫异常结束".format(self._batch_name))

            os._exit(137)  # 使退出码为35072 方便爬虫管理器重启
    def parse(self, request, response):
        """
        解析影院列表
        """
        cinema_list = response.xpath('//div[@class="cinema-cell"]')
        for cinema in cinema_list:
            # cinema info
            name = cinema.xpath(
                './/a[@class="cinema-name"]/text()').extract_first()
            address = cinema.xpath(
                './/p[@class="cinema-address"]/text()').extract_first()
            cinema_id = cinema.xpath('.//a[@class="cinema-name"]/@data-val'
                                     ).re_first(r"cinema_id: (\d*)")
            tags = cinema.xpath(
                ".//div[@class='cinema-tags']//span/text()").extract()
            # store the cinema list item
            item = maoyan_cinema_list_item.MaoyanCinemaListItem()
            item.name = name
            item.address = address
            item.cinema_id = cinema_id
            item.tags = tags
            item.crawl_time = tools.get_current_date()
            item.city_id = request.city_id
            yield item

            # crawl the show times
            url = cinema.xpath(
                './/a[@class="cinema-name"]/@href').extract_first()
            next_request = request.copy()
            next_request.url = url
            next_request.cinema_id = cinema_id
            next_request.callback = self.parse_play_time
            yield next_request

        # pagination
        page = request.page
        page_urls = response.xpath(
            '//ul[@class="list-pager"]//a/@href').extract()
        for page_url in page_urls[1:-1]:
            page += 1
            yield feapder.Request(
                page_url,
                city_id=request.city_id,
                movie_id=request.movie_id,
                show_date=request.show_date,
                brand_id=request.brand_id,
                page=page,
                task_id=request.task_id,
            )

        if request.page == 1:
            # update the task state
            yield self.update_task_batch(request.task_id, 1)

def deal_file_info(file):
    file = file.replace("{DATE}", tools.get_current_date())
    file = file.replace("{USER}", getpass.getuser())

    return file
Example #11
def deal_file_info(file):
    file = file.replace("{DATE}", tools.get_current_date())
    file = file.replace("{USER}",
                        os.getenv("FEAPDER_USER") or getpass.getuser())

    return file
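
A usage sketch of the helper above on a template string (the template text is illustrative):

template = "# created by {USER} on {DATE}\n"
print(deal_file_info(template))
# -> e.g. "# created by alice on 2024-01-01 12:00:00"
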
    def parse(self, request, response):
        movie_list = response.xpath("//dl[@class='movie-list']//dd")
        for movie in movie_list:
            cover = movie.xpath(
                './div[@class="movie-item film-channel"]//div[@class="movie-poster"]/img[2]/@data-src'
            ).extract_first()
            name = movie.xpath(
                './div[@class="channel-detail movie-item-title"]/@title'
            ).extract_first()
            url = movie.xpath(
                './div[@class="channel-detail movie-item-title"]/a/@href'
            ).extract_first()
            score = movie.xpath(
                'string(./div[@class="channel-detail channel-detail-orange"])'
            ).extract_first()
            movie_type = movie.re_first("类型:</span>(.*?)<", default="").strip()
            main_actor = movie.re_first("主演:</span>(.*?)<", default="").strip()
            release_date = movie.re_first("上映时间:</span>(.*?)<",
                                          default="").strip()
            imax = movie.xpath(
                './div[@class="movie-item film-channel"]//div[@class="movie-ver"]/i/@class'
            ).extract_first()
            movie_id = url.split("/")[-1]

            item = maoyan_hot_movie_list_item.MaoyanHotMovieListItem()
            item.name = name
            item.movie_id = movie_id
            item.cover = cover
            item.url = url
            item.score = score
            item.movie_type = movie_type
            item.main_actor = main_actor
            item.release_date = release_date
            item.imax = imax
            item.city_id = request.city_id
            item.crawl_time = tools.get_current_date()
            yield item

            # movie detail tasks
            for brand_id in setting.BRAND_IDS:
                detail_task_item = (
                    maoyan_film_detail_task_item.MaoyanFilmDetailTaskItem())
                detail_task_item.movie_id = movie_id  # movie id
                detail_task_item.city_id = request.city_id  # city id
                detail_task_item.brand_id = brand_id  # brand; -1 means all
                detail_task_item.show_date = tools.get_current_date(
                    "%Y-%m-%d")  # date
                yield detail_task_item

        # pagination
        if request.page == 1:
            total_page = response.xpath(
                '//ul[@class="list-pager"]//li[last()-1]/a/text()'
            ).extract_first()
            if total_page:
                total_page = int(total_page)
                for page in range(2, total_page + 1):
                    yield feapder.Request(page=page,
                                          city_id=request.city_id,
                                          task_id=request.task_id)

            # update the task state
            yield self.update_task_batch(request.task_id, 1)