Example 1
    def __dl_in_element_style_img(self, soup, url):
        """
        Collect image resources referenced from inline styles in the HTML page,
        e.g. <xx style='background: url(xxxxx.jpg)'>
        :param soup:
        :param url:
        :return:
        """
        inner_style_node = soup.find_all(
            style=re.compile(r"url\("))  # TODO also match uppercase URL(...)
        for style in inner_style_node:
            # Handle every url(...) occurrence, not just the first match.
            for matched in re.findall(r'url\(.*?\)', style.get("style")):
                resource_url = self.__get_style_url_link(matched)
                if is_inline_resource(resource_url):  # inline base64 image
                    continue
                abs_link = get_abs_url(url, resource_url)
                if self.is_ref_model:
                    style['style'] = style['style'].replace(resource_url, abs_link)
                elif is_same_web_site_link(url,
                                           abs_link) or self.is_grab_outer_link:
                    file_name = get_file_name_from_url(abs_link,
                                                       self.file_name_dup_checker,
                                                       'png')
                    file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                    replace_url = f"{self.img_dir}/{file_name}"
                    style['style'] = style['style'].replace(
                        resource_url, replace_url)
                    self.__url_enqueue(abs_link, file_save_path,
                                       self.FILE_TYPE_BIN)
                else:
                    style['style'] = style['style'].replace(resource_url, abs_link)
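
The `__get_style_url_link` helper is not shown in these examples; presumably it strips the `url(...)` wrapper and any surrounding quotes. A minimal sketch under that assumption (name and behavior are guesses, not the original helper):

import re

def get_style_url_link(style_url_token):
    # Assumed behavior: url('x.jpg'), url("x.jpg") and url(x.jpg) all yield 'x.jpg'.
    inner = re.sub(r'^url\(|\)$', '', style_url_token.strip())
    return inner.strip('\'" ')

print(get_style_url_link("url('img/bg.png')"))  # img/bg.png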
Example 2
    def __get_same_site_link(self, soup, url):
        """
        获取soup里的全部页面的url
        :param soup:
        :param url:
        :return:
        """
        new_url = []
        if self.is_full_site:
            a_list = soup.find_all("a")
            if a_list:  # find_all returns a (possibly empty) list, never None
                for a in a_list:
                    try:
                        raw_link = a.get("href")
                        if raw_link is None or raw_link.startswith(
                                "#") or not is_page_url(raw_link):
                            continue

                        abs_link = get_abs_url(url, raw_link)  # newly discovered url
                        abs_link = format_url(abs_link)
                        a['href'] = abs_link

                        if is_page_url(abs_link) and not is_img_ext(
                                abs_link) and abs_link not in new_url:
                            new_url.append(abs_link)
                    except Exception as e:
                        self.logger.info("%s: %s", a, e)
                        self.logger.exception(e)
                        continue

        return new_url
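
For context, the same link-harvesting pattern can be exercised standalone; this sketch substitutes `urllib.parse.urljoin` for the unshown `get_abs_url` helper and omits the `is_page_url`/`format_url` checks:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = '<a href="/about">About</a> <a href="#top">Top</a> <a>no href</a>'
soup = BeautifulSoup(html, "html.parser")

new_url = []
for a in soup.find_all("a"):
    raw_link = a.get("href")
    if raw_link is None or raw_link.startswith("#"):
        continue
    abs_link = urljoin("https://example.com/index.html", raw_link)
    a['href'] = abs_link
    if abs_link not in new_url:
        new_url.append(abs_link)

print(new_url)  # ['https://example.com/about']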
Example 3
    def __dl_img(self, soup, url):
        """
        下载图片,并替换html里图片的地址
        :param soup:
        :param url:
        :return:
        """
        images = soup.find_all("img")
        for img in images:
            raw_link = img.get("src")
            if raw_link is None or is_inline_resource(
                    raw_link):  # skip inline base64 images: <img src='data:image...'/>
                continue
            abs_link = get_abs_url(url, raw_link)

            if self.is_ref_model:
                img['src'] = abs_link
            elif is_same_web_site_link(url,
                                       abs_link) or self.is_grab_outer_link:
                file_name = get_file_name_from_url(abs_link,
                                                   self.file_name_dup_checker,
                                                   "png")
                file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                replace_url = f"{self.img_dir}/{file_name}"
                img['src'] = replace_url

                self.__url_enqueue(abs_link, file_save_path,
                                   self.FILE_TYPE_BIN)
            else:  # rewrite as an absolute link
                img['src'] = abs_link

            if img.get("crossorigin ") is not None:
                del img['crossorigin ']
            if img.get("integrity") is not None:
                del img['integrity']
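
`is_inline_resource` is another helper that is not shown; from its call sites it presumably detects data: URIs. A one-function sketch under that assumption:

def is_inline_resource(link):
    # Sketch: inline resources are data URIs such as
    # <img src='data:image/png;base64,...'/> (assumed behavior).
    return link.strip().lower().startswith("data:")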
Example 4
    async def __dl_link(self, soup, url):
        """
        下载<link>标签里的资源,并替换html里的地址
        :param soup:
        :param url:
        :return:
        """
        css_src = soup.find_all("link")
        if not css_src:  # find_all returns a (possibly empty) list, never None
            return
        for css in css_src:
            raw_link = css.get("href")
            if raw_link is None:
                continue
            abs_link = get_abs_url(url, raw_link)

            if self.is_ref_model:
                css['href'] = abs_link
            elif is_same_web_site_link(
                    url, abs_link) or self.is_grab_outer_link:  # controls whether outer-link resources are grabbed
                file_name = get_file_name_from_url(abs_link,
                                                   self.file_name_dup_checker,
                                                   'css')

                if is_img_ext(file_name):
                    file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                    replace_url = f"{self.img_dir}/{file_name}"
                    self.__url_enqueue(abs_link, file_save_path,
                                       self.FILE_TYPE_BIN)
                else:
                    file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                    replace_url = f"{self.css_dir}/{file_name}"
                    if not self.__is_dup(abs_link, file_save_path):
                        resp_text, _ = await self.__async_get_request_text(
                            abs_link, force_as_text=True)
                        if resp_text is not None:
                            text_content = resp_text
                            text_content = await self.__replace_and_grab_css_url(
                                abs_link, text_content)
                            self.__set_dup_url(abs_link, file_save_path)
                        await self.__async_save_text_file(
                            text_content, file_save_path)  # save the css file
                        else:
                            self.__log_error_resource(abs_link, file_save_path)

                css['href'] = replace_url
            else:  # rewrite the link url as an absolute address
                css['href'] = abs_link

            # Disable cross-origin locking and integrity verification
            if css.get("crossorigin") is not None:
                del css['crossorigin']
            if css.get('integrity') is not None:
                del css['integrity']
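
`is_img_ext` is likewise unshown; a plausible sketch that decides by file extension (the extension list here is an assumption):

IMG_EXTS = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.ico')  # assumed list

def is_img_ext(file_name):
    # Sketch: classify by extension so an image served via <link> goes to the img dir.
    return file_name.lower().endswith(IMG_EXTS)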
Example 5
    async def __process_in_html_css_resource(self, soup, url):
        """
        Process url(...) references inside inline <style> blocks,
        downloading the resources and rewriting the CSS text.
        """
        style_css = soup.find_all("style")
        if not style_css:
            return
        for style in style_css:
            css_content = style.text
            if not css_content:
                continue
            all_urls = re.findall(r'url\(.*?\)', css_content)
            for raw_u in all_urls:
                u = self.__get_style_url_link(
                    raw_u)  # > url('xxx') or url(xxx)
                if is_inline_resource(u):  # inline base64 image
                    continue
                abs_link = get_abs_url(url, u)
                if self.is_ref_model:
                    css_content = css_content.replace(
                        raw_u, f'url({abs_link})')
                elif is_same_web_site_link(
                        url, abs_link) or self.is_grab_outer_link:
                    file_name = get_file_name_from_url(
                        abs_link, self.file_name_dup_checker)
                    if is_img_ext(file_name):
                        file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                        replace_url = f"{self.img_dir}/{file_name}"
                    else:
                        file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                        replace_url = f"{self.css_dir}/{file_name}"
                    css_content = css_content.replace(
                        raw_u, f'url({replace_url})')
                    self.__url_enqueue(abs_link, file_save_path,
                                       self.FILE_TYPE_BIN)
                else:  # rewrite, in particular protocol-relative //-style urls
                    css_content = css_content.replace(
                        raw_u, f'url({abs_link})')
            style.string = css_content
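
To see what the lazy url(...) regex above actually captures:

import re

css = "body { background: url('bg.png') } .icon { background: URL(data:a) }"
print(re.findall(r'url\(.*?\)', css))
# ["url('bg.png')"]  -- the uppercase URL(...) is missed, matching the TODO in Example 1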
Example 6
    def __dl_js(self, soup, url):
        """
        下载js,替换html里js文件的地址
        :param soup:
        :param url:
        :return:
        """
        scripts_urls = soup.find_all("script")
        for scripts in scripts_urls:
            raw_link = scripts.get("src")
            if raw_link is None:
                continue
            abs_link = get_abs_url(url, raw_link)

            if self.is_ref_model:
                scripts['src'] = abs_link
            elif is_same_web_site_link(url,
                                       abs_link) or self.is_grab_outer_link:
                """
                如果是外链引入的js就不管了,除非打开了开关
                """
                file_name = get_file_name_from_url(abs_link,
                                                   self.file_name_dup_checker,
                                                   "js")
                file_save_path = f"{self.__get_js_full_path()}/{file_name}"
                replace_url = f"{self.js_dir}/{file_name}"
                scripts['src'] = replace_url
                self.__url_enqueue(abs_link, file_save_path,
                                   self.FILE_TYPE_TEXT)
                # Disable cross-origin locking and integrity verification
                if scripts.get("crossorigin") is not None:
                    del scripts['crossorigin']
                if scripts.get('integrity') is not None:
                    del scripts['integrity']
            else:
                scripts['src'] = abs_link
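
`get_file_name_from_url` and the `file_name_dup_checker` passed to it are not shown. A sketch of the assumed contract (derive a name from the URL path, fall back to the default extension, and de-duplicate against previously used names; all details are guesses):

from urllib.parse import urlparse

def get_file_name_from_url(abs_link, seen_names, default_ext="bin"):
    # Sketch: take the last path segment, ensure it has an extension,
    # and rename on collision (assumes the dup checker is a set of names).
    name = urlparse(abs_link).path.rsplit('/', 1)[-1] or f"index.{default_ext}"
    if '.' not in name:
        name = f"{name}.{default_ext}"
    base, i = name, 1
    while name in seen_names:
        name = f"{i}_{base}"
        i += 1
    seen_names.add(name)
    return name

seen = set()
print(get_file_name_from_url("https://example.com/static/app.js", seen, "js"))  # app.js
print(get_file_name_from_url("https://cdn.example.com/app.js", seen, "js"))     # 1_app.js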
Example 7
    async def __replace_and_grab_css_url(self, url, text):
        """
        @import url(font-awesome.min.css);
        @import "https://fonts.googleapis.com/css?family=Montserrat:700|Open+Sans:300,400|Pacifico";
        :param url:
        :param text:
        :return:
        """
        urls = re.findall(r"url\(.*?\)", text)  # TODO match is case-sensitive; URL(...) is missed
        for u in urls:
            relative_u = self.__get_style_url_link(u)
            if is_inline_resource(relative_u):  # inline base64 image
                continue

            abs_link = get_abs_url(url, relative_u)
            if relative_u.endswith(
                    ".css"):  # skip .css targets here so the @import pass below does not grab them twice
                self.logger.warning("skip css file, grab in the next step: %s",
                                    abs_link)
                continue

            if self.is_grab_outer_link:  # once grabbing outer links is enabled, every resource in the css is fetched unconditionally, same-site or not
                file_name = get_file_name_from_url(abs_link,
                                                   self.file_name_dup_checker,
                                                   'css')
                is_img = is_img_ext(file_name)
                if is_img:
                    file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                    replace_url = f"../{self.img_dir}/{file_name}"
                    self.__url_enqueue(abs_link, file_save_path,
                                       self.FILE_TYPE_BIN)
                    text = text.replace(relative_u, replace_url)
                else:
                    file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                    replace_url = f"{file_name}"  # 由于是相对于css文件的引入,因此是平级关系, 如果是图片就需要从../img目录下
                    self.__url_enqueue(abs_link, file_save_path,
                                       self.FILE_TYPE_BIN)
                    text = text.replace(relative_u, replace_url)

        imported_css = re.findall(r'@import\s+["\']+(.*?)["\']',
                                  text)  # @import "..." form in css
        imported_css2 = re.findall(
            r'@import\s+url\(.*?\)',
            text)  # > ['@import url(font-awesome.min.css)']
        for x in imported_css2:  # findall returns a list, so no None check is needed
            x = self.__get_style_url_link(x.split()[1])
            imported_css.append(x)

        if imported_css:
            for u in imported_css:
                if u.startswith("http"):  # the "http" prefix also covers "https"
                    abs_link = u
                else:
                    abs_link = get_abs_url(url, u)

                file_name = get_file_name_from_url(abs_link,
                                                   self.file_name_dup_checker,
                                                   'css')
                file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                if not self.__is_dup(abs_link, file_save_path):
                    resp_text, _ = await self.__async_get_request_text(abs_link)

                    if resp_text is not None:
                        text_content = resp_text
                        text_content = await self.__replace_and_grab_css_url(
                            abs_link, text_content)
                        self.__set_dup_url(abs_link, file_save_path)
                        # Before saving, rewrite the @import target inside the fetched css
                        text_content = text_content.replace(u, file_name)
                        await self.__async_save_text_file(
                            text_content, file_save_path)  # save the css file
                    else:
                        self.__log_error_resource(abs_link, file_save_path)  # download failed

        return text
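
To illustrate what the two @import regexes above extract from a stylesheet:

import re

text = '@import url(font-awesome.min.css);\n@import "https://fonts.googleapis.com/css?family=Montserrat:700";'
print(re.findall(r'@import\s+["\']+(.*?)["\']', text))
# ['https://fonts.googleapis.com/css?family=Montserrat:700']
print(re.findall(r'@import\s+url\(.*?\)', text))
# ['@import url(font-awesome.min.css)']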