Exemple #1
0
 def from_crawler(cls, crawler):
     """Create the instance for Scrapy and subscribe it to spider signals.

     Args:
         crawler: Object exposing a ``signals`` manager with a ``connect``
             method.

     Returns:
         A new ``cls()`` instance whose ``spider_opened`` and
         ``spider_closed`` methods are connected to the matching signals.
     """
     PrintFormatUtil.print_line("重新定义crawler-spider")
     instance = cls()
     # Register the lifecycle listeners on the crawler's signal manager.
     connect = crawler.signals.connect
     connect(instance.spider_opened, signal=signals.spider_opened)
     connect(instance.spider_closed, signal=signals.spider_closed)
     return instance
Exemple #2
0
    def process_response(self, request, response, spider):
        """Log that a response arrived and hand it back unchanged.

        Per the downloader-middleware contract this hook must return a
        Response, return a Request, or raise IgnoreRequest; here the
        downloader's response is always passed through as-is.
        """
        message = "spider {} : 开始处理 Response".format(spider.name)
        PrintFormatUtil.print_line(message)
        return response
 def start_requests(self):
     """Yield one screenshot-taking SeleniumRequest per ``self.link_list`` entry.

     ``self.link_list`` is iterated with ``.items()``; each key is logged
     and forwarded as ``r_dict['title']``, each value is used as the URL.
     """
     for name, url in self.link_list.items():
         message = "检查{}的页面, url {}".format(name, url)
         PrintFormatUtil.print_line(message)
         extra = {'title': name}
         request = SeleniumRequest(
             url=url,
             callback=self.parse,
             screen_shot=True,
             wait_time=20,
             r_dict=extra,
         )
         yield request
Exemple #4
0
 def start_requests(self):
     """Yield one screenshot-taking SeleniumRequest per item of ``self.link_list``.

     Each item must provide ``get_id()``, ``get_name()`` and ``get_url()``;
     the name and id are forwarded to the callback through ``r_dict``.
     """
     for product in self.link_list:
         message = "检查({}){}的页面, url {}".format(
             product.get_id(), product.get_name(), product.get_url())
         PrintFormatUtil.print_line(message)
         extra = {'title': product.get_name(), 'id': product.get_id()}
         request = SeleniumRequest(
             url=product.get_url(),
             callback=self.parse,
             screen_shot=True,
             wait_time=20,
             r_dict=extra,
         )
         yield request
Exemple #5
0
 def process_item(self, item, spider):
     """Persist items of the product spider and count successful saves.

     Items coming from any spider other than the one named
     ``CONST.ALI_YUN_PRODUCT_SPIDER_NAME`` are passed through untouched.

     Args:
         item: The scraped item to store.
         spider: The spider that produced ``item``; its ``name`` selects
             the storage rule and the counter to increment.

     Returns:
         ``item`` itself, so later pipeline stages still receive it.
     """
     PrintFormatUtil.print_line(self.__module__)
     if spider.name != CONST.ALI_YUN_PRODUCT_SPIDER_NAME:
         # Nothing to store for this spider. Without this guard the code
         # below would read an unbound ``result`` and raise
         # UnboundLocalError.
         return item

     result = spider_db.save_product_item_to_db(item)
     item_text = str(item).strip()
     if result == 1:
         PrintFormatUtil.print_success("spider %s success ..." % item_text)
         self.counter[spider.name].add()
     else:
         PrintFormatUtil.print_error("spider %s fail ..." % item_text)
     return item
Exemple #6
0
    def spider_opened(self, spider):
        """Create the browser driver that matches the spider's crawl type.

        ``spider.crawl_type.value`` selects the backend:

        * ``'selenium'``: a Chrome webdriver configured with every argument
          in ``CONST.CHROME_DRIVER_OPTIONS``.
        * ``'puppeeter'``: a browser started through ``launch``, after
          lowering the ``pyppeteer`` and ``websockets.protocol`` loggers to
          WARNING.

        For any other value ``self.driver`` is left untouched.

        Args:
            spider: The spider being opened; must expose ``name`` and
                ``crawl_type.value``.
        """
        PrintFormatUtil.print_line("spider {} : 开始处理".format(spider.name))
        crawl_mode = spider.crawl_type.value
        PrintFormatUtil.print_line("spider {} , 运行模式 {}".format(spider.name, crawl_mode))
        if crawl_mode == 'selenium':
            chrome_options = Options()
            # Plain loop: add_argument is called for its side effect only.
            for argument in CONST.CHROME_DRIVER_OPTIONS:
                chrome_options.add_argument(argument)
            self.driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CONST.CHROME_DRIVER_BIN_PATH)
        if crawl_mode == 'puppeeter':
            # Silence the chatty browser-automation loggers.
            for logger_name in ('pyppeteer', 'websockets.protocol'):
                logging.getLogger(logger_name).setLevel(logging.WARNING)
            # NOTE(review): the option key is spelled 'Headless'; pyppeteer
            # documents the lowercase key 'headless' - confirm whether this
            # key is actually honoured before changing it.
            self.driver = sync(launch({'Headless': True, 'args': ['--no-sandbox', '--disable-gpu'], 'dumpio': True}))
 def output_diff(self):
     """Write an image highlighting where ``self.i_a`` and ``self.i_b`` differ.

     The pixel-wise difference of the two images is computed; when its
     bounding box is empty nothing is written. Otherwise the difference is
     turned into a mask (lookup table: 0 stays 0, every other value becomes
     255), ``self.i_b`` is pasted through that mask and the result is saved
     to ``self.i_diff``.

     A ``ValueError`` raised along the way is logged, not propagated.
     """
     try:
         delta = ImageChops.difference(self.i_a, self.i_b)
         if not delta.getbbox():
             return
         PrintFormatUtil.print_line("存在差异, 生成差异图片 {}".format(
             self.i_diff))
         lookup = [0] + [255] * 255
         mask = delta.convert('L').point(lookup)
         canvas = mask.convert('RGB')
         canvas.paste(self.i_b, mask=mask)
         canvas.save(self.i_diff)
     except ValueError as e:
         # Explanation shown to the operator next to the exception text.
         text = (
             "表示图片大小和box对应的宽度不一致,参考API说明:Pastes another image into this image."
             "The box argument is either a 2-tuple giving the upper left corner, a 4-tuple defining the left, upper, "
             "right, and lower pixel coordinate, or None (same as (0, 0)). If a 4-tuple is given, the size of the pasted "
             "image must match the size of the region.使用2纬的box避免上述问题")
         PrintFormatUtil.print_line("【{0}】{1}".format(e, text))
Exemple #8
0
 def parse(self, response):
     """Yield a ProductItem for every listed service not already known.

     The response body is decoded as UTF-8 JSON and must contain a
     non-empty ``product`` mapping. Each first-level category that has a
     ``name`` list is walked; a service is yielded only when
     ``"<category1>:<title>"`` is absent from ``self.product_list``.

     Raises:
         AssertionError: If ``product`` is missing or empty.
     """
     payload = json.loads(response.body.decode('utf-8'))
     assert 'product' in payload and len(
         payload['product']) > 0, "URL{} 数据不符合要求 ".format(response.url)
     products = payload['product']
     for category_1 in products:
         category = products[category_1]
         if 'name' not in category:
             continue
         PrintFormatUtil.print_line("处理一级类目{}".format(category_1))
         for service in category['name']:
             item = ProductItem()
             item['category1'] = service['category1']
             # The second-level category is optional in the payload.
             item['category2'] = (service['category2']
                                  if 'category2' in service else '')
             item['title'] = service['title']
             item['description'] = service['description']
             item['link'] = service['textLink']
             service_key = "%s:%s" % (service['category1'], service['title'])
             if service_key in self.product_list:
                 continue
             PrintFormatUtil.print_line("获取子服务{}".format(item))
             yield item
Exemple #9
0
 def spider_closed(self, spider):
     """Tear down the browser driver that belongs to the spider's crawl type.

     In ``'selenium'`` mode the driver is closed and then quit; in
     ``'puppeeter'`` mode the browser's ``close()`` is run through ``sync``.
     Nothing is torn down when ``self.driver`` is ``None``.
     """
     PrintFormatUtil.print_line("spider {} : 结束处理".format(spider.name))
     crawl_mode = spider.crawl_type.value
     if crawl_mode == 'selenium':
         if self.driver is not None:
             PrintFormatUtil.print_line("spider {} : selenium driver 销毁".format(spider.name))
             self.driver.close()
             self.driver.quit()
     elif crawl_mode == 'puppeeter':
         if self.driver is not None:
             PrintFormatUtil.print_line("spider {} : puppeeter driver 销毁".format(spider.name))
             sync(self.driver.close())
    def parse(self, response):
        """Store the page screenshot, shrink it and compare it with the last run.

        Steps:
          1. Write ``response.meta['screen_shot']`` to
             ``<CONST.PIC_PATH>/<title>/<timestamp>.png``.
          2. Save a half-size copy as ``<timestamp>_s.png``.
          3. If a ``latest`` file exists in that directory, read the previous
             half-size image name and its MD5 from it. When the stored MD5
             still matches the file on disk, compare old and new images with
             ``ImageSSIM`` and ``OpenCVSSIM`` and write diff images when both
             scores are below 1.
          4. Rewrite ``latest`` so it points at the new half-size image.

        Args:
            response: Response whose ``meta`` carries ``r_dict`` (with a
                ``title`` key) and ``screen_shot`` (bytes written to the
                ``.png`` file).
        """
        service_pic_path = os.path.join(CONST.PIC_PATH,
                                        response.meta['r_dict']['title'])
        os.makedirs(service_pic_path, exist_ok=True)
        current_time = str(int(time.time()))
        service_pic_name = os.path.join(service_pic_path,
                                        current_time + ".png")
        service_pic_small_name = os.path.join(service_pic_path,
                                              current_time + "_s.png")
        service_pic_diff_name = os.path.join(service_pic_path,
                                             current_time + "_diff.png")
        service_pic_oc_diff_name = os.path.join(service_pic_path,
                                                current_time + "_oc_diff.png")
        PrintFormatUtil.print_line("pic save path {}".format(service_pic_name))
        with open(service_pic_name, 'wb') as image_file:
            image_file.write(response.meta['screen_shot'])

        # Downscale to half size. The context manager releases the file
        # handle, and Image.LANCZOS is the filter formerly also exposed as
        # Image.ANTIALIAS (that alias no longer exists in Pillow >= 10).
        with Image.open(service_pic_name) as image:
            w, h = image.size
            PrintFormatUtil.print_line("原有图片大小 width {} height {}".format(w, h))
            d_img = image.resize((int(w / 2), int(h / 2)), Image.LANCZOS)
        w, h = d_img.size
        PrintFormatUtil.print_line("处理后的图片大小 width {} height {}".format(w, h))
        d_img.save(service_pic_small_name, quality=95)

        # Read the "latest" pointer file: "<image file name> <md5>".
        latest_path = os.path.join(service_pic_path, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as f:
                old_file_info = f.read().split()
            if len(old_file_info) >= 2:
                old_file_info_name, old_file_info_md5 = old_file_info[:2]
                old_service_pic_name = os.path.join(service_pic_path,
                                                    old_file_info_name)
                PrintFormatUtil.print_line(
                    "old pic path {}".format(old_service_pic_name))
                # Hash once; the value is needed for the check and the log.
                current_old_md5 = FileUtil.get_md5(old_service_pic_name)
                if old_file_info_md5 == current_old_md5:
                    PrintFormatUtil.print_line("比对图片 {} | {}".format(
                        service_pic_small_name, old_file_info_name))
                    # Compare the images with both the PIL and OpenCV
                    # implementations.
                    iss = ImageSSIM(service_pic_small_name,
                                    old_service_pic_name,
                                    service_pic_diff_name)
                    o_iss = OpenCVSSIM(service_pic_small_name,
                                       old_service_pic_name,
                                       service_pic_oc_diff_name)
                    pil_s_code = iss.compare_images()
                    oc_s_code = o_iss.compare_images()
                    PrintFormatUtil.print_line(
                        "PIL库两者的相似度: {}".format(pil_s_code))
                    PrintFormatUtil.print_line(
                        "OPEN_CV库两者的相似度: {}".format(oc_s_code))
                    # The threshold is tunable within (0-1); 1 is the
                    # strictest setting.
                    if pil_s_code < 1 and oc_s_code < 1:
                        iss.output_diff()
                        o_iss.output_diff()
                else:
                    PrintFormatUtil.print_line(
                        "old pic md5 error. new {} old {}".format(
                            current_old_md5, old_file_info_md5))
            else:
                # An empty or truncated pointer file used to raise
                # IndexError; skip the comparison and let it be rewritten.
                PrintFormatUtil.print_line(
                    "latest file is malformed, skip compare: {}".format(
                        latest_path))

        # Regenerate the "latest" pointer file for the next run.
        with open(latest_path, "w") as latest_file:
            latest_file.write(
                os.path.basename(service_pic_small_name) + " " +
                FileUtil.get_md5(service_pic_small_name))
Exemple #11
0
 def close_spider(self, spider):
     """Report how many records the spider's counter holds, then log the finish banner."""
     class_name = self.__class__.__name__
     PrintFormatUtil.print_line(class_name)
     inserted = self.counter[spider.name].total()
     PrintFormatUtil.print_line("total %s record inserted" % inserted)
     banner = " spider %s finished " % spider.name
     PrintFormatUtil.print_title(banner)
Exemple #12
0
 def open_spider(self, spider):
     """Log the start banner and set up a fresh SpiderCounter for this spider.

     The counter is stored in ``self.counter`` under the spider's name.
     """
     banner = " spider %s started " % spider.name
     PrintFormatUtil.print_title(banner)
     PrintFormatUtil.print_line(self.__class__.__name__)
     spider_name = spider.name
     self.counter[spider_name] = SpiderCounter()
     self.counter[spider_name].create_counter()
Exemple #13
0
    def process_request(self, request, spider):
        """Render the request in a browser when the spider asks for one.

        Two browser paths exist, chosen by ``spider.crawl_type.value`` together
        with the request's class:

        * ``'selenium'`` + ``SeleniumRequest``: load the URL in ``self.driver``,
          copy the request cookies into the browser, optionally wait for
          ``request.wait_until``, optionally take a screenshot after resizing
          the window to the document dimensions, optionally run
          ``request.script``, then return the page source as an
          ``HtmlResponse``.
        * ``'puppeeter'`` + ``PuppeeterRequest``: open a new page on
          ``self.driver``, navigate to the URL and return the page content as
          an ``HtmlResponse``. The page is not closed here; it is handed to the
          callback through ``request.meta['page']``.

        In both paths ``request.meta`` receives ``r_dict`` and ``driver``.

        Args:
            request: The request passing through the downloader middleware.
            spider: The spider that issued the request.

        Returns:
            An ``HtmlResponse`` built from the browser content, or ``None``
            when neither browser path applies.
        """
        PrintFormatUtil.print_line("spider {} : 开始处理 Request".format(spider.name))
        if spider.crawl_type.value == 'selenium' and isinstance(request, SeleniumRequest):
            # Start from a fixed window size before loading the page.
            self.driver.set_window_size(800, 600)
            self.driver.get(request.url)
            # copy cookie
            for cookie_name, cookie_value in request.cookies.items():
                self.driver.add_cookie(
                    {
                        'name': cookie_name,
                        'value': cookie_value
                    }
                )
            # Optional explicit wait, bounded by request.wait_time.
            if request.wait_until:
                WebDriverWait(self.driver, request.wait_time).until(request.wait_until)
            if request.screen_shot:
                # Get the actual page dimensions using javascript
                width = self.driver.execute_script(
                    "return Math.max(document.body.scrollWidth, document.body.offsetWidth, "
                    "document.documentElement.clientWidth, document.documentElement.scrollWidth, "
                    "document.documentElement.offsetWidth);")
                height = self.driver.execute_script(
                    "return Math.max(document.body.scrollHeight, document.body.offsetHeight, "
                    "document.documentElement.clientHeight, document.documentElement.scrollHeight, "
                    "document.documentElement.offsetHeight);")
                # resize
                PrintFormatUtil.print_line("reset size {}:{}".format(width, height))
                self.driver.set_window_size(width, height)
                # Pause for one second after the resize before capturing
                # (presumably to let the page re-layout - TODO confirm).
                time.sleep(1)
                request.meta['screen_shot'] = self.driver.get_screenshot_as_png()
            if request.script:
                self.driver.execute_script(request.script)
            # Forward the caller-supplied dictionary to the callback.
            request.meta['r_dict'] = request.r_dict
            body = str.encode(self.driver.page_source)
            # Expose the driver via the "meta" attribute
            request.meta.update({'driver': self.driver})
            return HtmlResponse(
                self.driver.current_url,
                body=body,
                encoding='utf-8',
                request=request
            )

        if spider.crawl_type.value == 'puppeeter' and isinstance(request, PuppeeterRequest):
            # Each request gets its own page on the shared browser.
            page = sync(self.driver.newPage())
            sync(page.setJavaScriptEnabled(enabled=True))
            # Present the browser as an iPhone Safari client.
            sync(page.setUserAgent(
                'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'))
            sync(page.goto(request.url))
            # Forward the caller-supplied dictionary to the callback.
            request.meta['r_dict'] = request.r_dict
            body = str.encode(sync(page.content()))
            # Expose the driver via the "meta" attribute
            request.meta.update({'driver': self.driver})
            request.meta.update({'page': page})
            return HtmlResponse(
                request.url,
                body=body,
                encoding='utf-8',
                request=request
            )

        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None
Exemple #14
0
 def start_requests(self):
     """Log how many services are already known, then request the product listing.

     A single Request for ``CONST.ALI_YUN_PRODUCT_URL`` is yielded, sent with
     ``CONST.ALI_YUN_HEADER`` and handled by ``self.parse``.
     """
     known_count = len(self.product_list)
     PrintFormatUtil.print_line("阿里云已有的服务一共{}个".format(known_count))
     listing_request = Request(
         CONST.ALI_YUN_PRODUCT_URL,
         callback=self.parse,
         headers=CONST.ALI_YUN_HEADER,
     )
     yield listing_request