コード例 #1
0
    def parse_frame_dom_resources(self, data):  # noqa
        # type: (Dict) -> RGridDom
        """Collect the resources of one frame and wrap them in an RGridDom.

        Reads ``url``, ``cdt`` and the optional ``resourceUrls``, ``blobs``
        and ``frames`` entries of *data*. Nested frames are processed
        recursively and their ``url`` entry is rewritten in place to the
        value returned by ``apply_base_url``. Everything collected is also
        merged into ``self.full_request_resources``.

        :param data: description of the frame as a dict.
        :return: RGridDom built from ``data["url"]``, ``data["cdt"]`` and
            the resources gathered for this frame.
        """
        base_url = data["url"]
        resource_urls = data.get("resourceUrls", [])
        all_blobs = data.get("blobs", [])
        frames = data.get("frames", [])
        call_summary = """
        parse_frame_dom_resources() call

        base_url: {base_url}
        count blobs: {blobs_num}
        count resource urls: {resource_urls_num}
        count frames: {frames_num}

        """.format(
            base_url=base_url,
            blobs_num=len(all_blobs),
            resource_urls_num=len(resource_urls),
            frames_num=len(frames),
        )
        logger.debug(call_summary)

        def find_child_resource_urls(content_type, content, resource_url):
            # type: (Optional[Text], bytes, Text) -> List[Text]
            """Return the URLs found in *content*, resolved via apply_base_url.

            An empty list is returned when *content_type* is falsy.
            """
            logger.debug("find_child_resource_urls({0}, {1}) call".format(
                content_type, resource_url))
            child_urls = []
            if content_type:
                for found_url in collect_urls_from_(content_type, content):
                    child_urls.append(
                        apply_base_url(found_url, base_url, resource_url))
            else:
                logger.debug(
                    "content_type is empty. Skip handling of resources")
            return child_urls

        frame_request_resources = {}

        # Nested frames first: each one is parsed recursively and stored
        # under its resolved URL.
        for frame in frames:
            frame["url"] = apply_base_url(frame["url"], base_url)
            frame_dom = self.parse_frame_dom_resources(frame)
            frame_request_resources[frame["url"]] = frame_dom.resource

        # Blobs already carry their content; only their children still need
        # to be downloaded.
        urls_to_fetch = set(resource_urls)
        for blob in all_blobs:
            blob_resource = VGResource.from_blob(blob, find_child_resource_urls)
            if blob_resource.url.rstrip("#") != base_url:
                frame_request_resources[blob_resource.url] = blob_resource
                urls_to_fetch.update(blob_resource.child_resource_urls)

        fetched = fetch_resources_recursively(
            urls_to_fetch,
            self.server_connector,
            self.resource_cache,
            find_child_resource_urls,
        )
        frame_request_resources.update(fetched)
        self.full_request_resources.update(frame_request_resources)
        return RGridDom(
            url=base_url,
            dom_nodes=data["cdt"],
            resources=frame_request_resources,
        )
コード例 #2
0
 def handle_resources(content_type, content, resource_url):
     # type: (Optional[Text], bytes, Text) -> None
     """Add every URL referenced by *content* to discovered_resources_urls.

     Each URL yielded by ``collect_urls_from_`` is passed through
     ``apply_base_url`` (with the enclosing ``base_url`` and *resource_url*)
     before being stored. Nothing is collected when *content_type* is falsy.
     """
     logger.debug("handle_resources({0}, {1}) call".format(
         content_type, resource_url))
     if content_type:
         for found_url in collect_urls_from_(content_type, content):
             discovered_resources_urls.add(
                 apply_base_url(found_url, base_url, resource_url))
         return
     logger.debug("content_type is empty. Skip handling of resources")
コード例 #3
0
 def find_child_resource_urls(content_type, content, resource_url):
     # type: (Optional[Text], bytes, Text) -> List[Text]
     """Return the URLs referenced by *content*, resolved via apply_base_url.

     Each URL yielded by ``collect_urls_from_`` is passed through
     ``apply_base_url`` together with the enclosing ``base_url`` and
     *resource_url*. An empty list is returned when *content_type* is falsy.
     """
     call_note = "find_child_resource_urls({0}, {1}) call".format(
         content_type, resource_url)
     logger.debug(call_note)
     child_urls = []
     if content_type:
         for found_url in collect_urls_from_(content_type, content):
             child_urls.append(
                 apply_base_url(found_url, base_url, resource_url))
     else:
         logger.debug("content_type is empty. Skip handling of resources")
     return child_urls
コード例 #4
0
ファイル: render_task.py プロジェクト: brent-brown/CFAHome
 def handle_resources(content_type, content, resource_url):
     """Record the URLs referenced by a CSS or SVG resource.

     URLs are extracted with ``parsers.get_urls_from_css_resource`` when
     *content_type* starts with ``text/css`` and with
     ``parsers.get_urls_from_svg_resource`` when it starts with
     ``image/svg``. URLs starting with ``data:`` or ``#`` are skipped; the
     rest are passed through ``apply_base_url`` and added to
     ``discovered_resources_urls`` while holding
     ``discovered_resources_lock``.

     :param content_type: content type of the resource; may be empty/None.
     :param content: raw content of the resource.
     :param resource_url: URL the resource was loaded from.
     """
     logger.debug("handle_resources({0}, {1}) call".format(
         content_type, resource_url))
     if not content_type:
         # Without this guard a missing content type raises AttributeError
         # on ``startswith``; there is nothing to parse in that case.
         logger.debug(
             "content_type is empty. Skip handling of resources")
         return
     urls_from_css, urls_from_svg = [], []
     if content_type.startswith("text/css"):
         urls_from_css = parsers.get_urls_from_css_resource(content)
     if content_type.startswith("image/svg"):
         urls_from_svg = parsers.get_urls_from_svg_resource(content)
     for discovered_url in urls_from_css + urls_from_svg:
         if discovered_url.startswith(("data:", "#")):
             # resource already in blob or not relevant
             continue
         target_url = apply_base_url(discovered_url, base_url,
                                     resource_url)
         with discovered_resources_lock:
             discovered_resources_urls.add(target_url)
コード例 #5
0
    def parse_frame_dom_resources(self, data):  # noqa
        # type: (Dict) -> RGridDom
        """Collect the resources of one frame and wrap them in an RGridDom.

        Reads ``url``, ``cdt`` and the optional ``resourceUrls``, ``blobs``
        and ``frames`` entries of *data*. Nested frames are processed
        recursively and their ``url`` entry is rewritten in place. Everything
        collected for this frame is also merged into
        ``self.full_request_resources``.

        :param data: description of the frame as a dict.
        :return: RGridDom built from ``data["url"]``, ``data["cdt"]`` and
            the resources gathered for this frame.
        """
        base_url = data["url"]
        resource_urls = data.get("resourceUrls", [])
        all_blobs = data.get("blobs", [])
        frames = data.get("frames", [])
        logger.debug("""
        parse_frame_dom_resources() call

        base_url: {base_url}
        count blobs: {blobs_num}
        count resource urls: {resource_urls_num}
        count frames: {frames_num}

        """.format(
            base_url=base_url,
            blobs_num=len(all_blobs),
            resource_urls_num=len(resource_urls),
            frames_num=len(frames),
        ))
        # Resources of this frame, keyed by URL.
        frame_request_resources = {}
        # URLs found inside other resources; filled by handle_resources().
        discovered_resources_urls = set()

        def handle_resources(content_type, content, resource_url):
            # type: (Optional[Text], bytes, Text) -> None
            """Add the URLs referenced by *content* to the discovered set.

            Nothing is collected when *content_type* is falsy.
            """
            logger.debug("handle_resources({0}, {1}) call".format(
                content_type, resource_url))
            if not content_type:
                logger.debug(
                    "content_type is empty. Skip handling of resources")
                return
            for url in collect_urls_from_(content_type, content):
                target_url = apply_base_url(url, base_url, resource_url)
                discovered_resources_urls.add(target_url)

        def get_resource(link):
            # type: (Text) -> VGResource
            """Download *link* and build a VGResource from the response.

            handle_resources is passed as the ``on_created`` callback.
            """
            logger.debug("get_resource({0}) call".format(link))
            response = self.eyes_connector.download_resource(link)
            return VGResource.from_response(link,
                                            response,
                                            on_created=handle_resources)

        # Nested frames: resolve the frame URL (mutating f_data) and store
        # the recursively parsed frame under it.
        for f_data in frames:
            f_data["url"] = apply_base_url(f_data["url"], base_url)
            frame_request_resources[f_data[
                "url"]] = self.parse_frame_dom_resources(f_data).resource

        # Blobs: skip the one whose URL (ignoring trailing "#") is the frame
        # itself.
        for blob in all_blobs:
            resource = VGResource.from_blob(blob, on_created=handle_resources)
            if resource.url.rstrip("#") == base_url:
                continue
            frame_request_resources[resource.url] = resource

        # First pass: request every URL known so far, then let the cache
        # process them.
        for r_url in set(resource_urls).union(discovered_resources_urls):
            self.resource_cache.fetch_and_store(r_url, get_resource)
        self.resource_cache.process_all()

        # Some discovered URLs become available only after the resources
        # above have been processed.
        # NOTE(review): no process_all() follows this second pass; presumably
        # the cache lookup below yields the result once it is ready - confirm.
        for r_url in discovered_resources_urls:
            self.resource_cache.fetch_and_store(r_url, get_resource)

        # The union is recomputed on purpose: the discovered set may have
        # grown since the first pass.
        for r_url in set(resource_urls).union(discovered_resources_urls):
            val = self.resource_cache[r_url]
            if val is None:
                logger.debug("No response for {}".format(r_url))
                continue
            frame_request_resources[r_url] = val
        self.full_request_resources.update(frame_request_resources)
        return RGridDom(url=base_url,
                        dom_nodes=data["cdt"],
                        resources=frame_request_resources)
コード例 #6
0
def test_apply_base_url(discovered_url, site_base_url, resource_url, result):
    """Check that apply_base_url yields *result* for the given inputs."""
    resolved = apply_base_url(discovered_url, site_base_url, resource_url)
    assert resolved == result
コード例 #7
0
ファイル: render_task.py プロジェクト: brent-brown/CFAHome
    def parse_frame_dom_resources(self, data):  # noqa
        # type: (Dict) -> RGridDom
        """Collect the resources of one frame and wrap them in an RGridDom.

        Reads ``url``, ``cdt`` and the optional ``resourceUrls``, ``blobs``
        and ``frames`` entries of *data*. Nested frames are processed
        recursively and their ``url`` entry is rewritten in place. All
        resources are accumulated in ``self.request_resources``, which is
        also what the returned RGridDom receives.

        :param data: description of the frame as a dict.
        :return: RGridDom built from ``data["url"]``, ``data["cdt"]`` and
            ``self.request_resources``.
        """
        logger.debug("parse_frame_dom_resources() call")
        base_url = data["url"]
        resource_urls = data.get("resourceUrls", [])
        all_blobs = data.get("blobs", [])
        frames = data.get("frames", [])
        discovered_resources_urls = set()
        discovered_resources_lock = Lock()

        def handle_resources(content_type, content, resource_url):
            """Record the URLs referenced by a CSS or SVG resource."""
            logger.debug("handle_resources({0}, {1}) call".format(
                content_type, resource_url))
            if not content_type:
                # A missing content type would otherwise raise
                # AttributeError on ``startswith``.
                logger.debug(
                    "content_type is empty. Skip handling of resources")
                return
            urls_from_css, urls_from_svg = [], []
            if content_type.startswith("text/css"):
                urls_from_css = parsers.get_urls_from_css_resource(content)
            if content_type.startswith("image/svg"):
                urls_from_svg = parsers.get_urls_from_svg_resource(content)
            for discovered_url in urls_from_css + urls_from_svg:
                if discovered_url.startswith(("data:", "#")):
                    # resource already in blob or not relevant
                    continue
                target_url = apply_base_url(discovered_url, base_url,
                                            resource_url)
                with discovered_resources_lock:
                    discovered_resources_urls.add(target_url)

        def get_resource(link):
            # type: (Text) -> VGResource
            """Download *link* and build a VGResource from the response."""
            logger.debug("get_resource({0}) call".format(link))
            response = self.eyes_connector.download_resource(link)
            return VGResource.from_response(link,
                                            response,
                                            on_created=handle_resources)

        def discovered_snapshot():
            """Return a copy of the discovered URLs taken under the lock.

            The set is written under the lock by handle_resources, so it
            must not be iterated live while fetches are being scheduled.
            """
            with discovered_resources_lock:
                return set(discovered_resources_urls)

        for f_data in frames:
            f_data["url"] = apply_base_url(f_data["url"], base_url)
            self.request_resources[f_data[
                "url"]] = self.parse_frame_dom_resources(f_data).resource

        for blob in all_blobs:
            resource = VGResource.from_blob(blob, on_created=handle_resources)
            if resource.url.rstrip("#") == base_url:
                continue
            self.request_resources[resource.url] = resource

        for r_url in set(resource_urls).union(discovered_snapshot()):
            self.resource_cache.fetch_and_store(r_url, get_resource)
        self.resource_cache.process_all()

        # Some discovered URLs become available only after the resources
        # above have been processed.
        for r_url in discovered_snapshot():
            self.resource_cache.fetch_and_store(r_url, get_resource)

        # Recomputed on purpose: the discovered set may have grown.
        for r_url in set(resource_urls).union(discovered_snapshot()):
            val = self.resource_cache[r_url]
            if val is None:
                logger.debug("No response for {}".format(r_url))
                continue
            self.request_resources[r_url] = val
        return RGridDom(url=base_url,
                        dom_nodes=data["cdt"],
                        resources=self.request_resources)