Python Match.fix_html Examples

Programming Language: Python

Namespace/Package Name: src.tools.match

Class/Type: Match

Method/Function: fix_html

Examples at hotexamples.com: 9

Python Match.fix_html - 9 examples found. These are the top-rated real-world Python examples of src.tools.match.Match.fix_html extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

fix_html(5)

fix_filename(5)

get_website_kind(3)

column(3)

create_img_element_with_file_name(3)

csdnblog_author(2)

sinablog_author(2)

jianshu_author(2)

get_url_kind(2)

SinaBlog_profile(2)

SinaBlog(2)

create_local_img_src(2)

article(2)

answer(2)

author(2)

collection(2)

huxiu(1)

doc360(1)

replace_words(1)

jianshu_notebooks(1)

jianshu_collection(1)

fiel(1)

isUrlOk(1)

huawei(1)

cnblogs_author(1)

html_body(1)

detect_recipe_kind(1)

avatar_create_img_element_with_file_name(1)

get_recipe_kind(1)

generate_img_src(1)

format_avatar(1)

sinablog_profile(1)

Example #1

Show file

    def fix_image(self, content):
        """Point the ``<img>`` tags in ``content`` at locally stored images.

        ``content`` is first passed through ``Match.fix_html``.  Every
        distinct ``<img ...>`` tag is then rewritten exactly once:

        * a tag with no ``src="..."`` attribute, or whose src consists only
          of spaces, merely gets ``</img>`` appended;
        * otherwise the src is handed to ``HtmlCreator.fix_image_src``; a
          truthy result is passed to ``self.image_container.add`` and the
          value it returns is used as the filename (an empty string is used
          when the result is falsy).  The src becomes
          ``../images/<filename>`` and the tag is wrapped in a
          ``duokan-image-single`` div.

        :param content: HTML text to process.
        :return: the rewritten HTML text.
        """
        content = Match.fix_html(content)
        handled = set()
        for img in re.findall(r'<img[^>]*', content):
            # The pattern stops just before '>', so put it back.  A trailing
            # '/' from a self-closing tag is left untouched in this variant.
            img += '>'

            # str.replace below rewrites every occurrence of a tag at once.
            # Visiting a repeated tag a second time used to append a second
            # '</img>' to tags without a usable src, so skip repeats.
            if img in handled:
                continue
            handled.add(img)

            found = re.search(r'(?<=src=").*?(?=")', img)
            src = found.group(0) if found else ''
            if src.replace(' ', '') == '':
                # Missing or blank src: only close the tag.
                content = content.replace(img, img + '</img>')
                continue

            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                filename = self.image_container.add(src_download)
            else:
                filename = ''

            new_image = img.replace('"{}"'.format(src),
                                    '"../images/{}"'.format(filename))
            # Hard-coded Sina image URL (sg_trans.gif); this could be moved
            # into Match.fix_html.
            new_image = new_image.replace(
                'http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',
                '../images/{}'.format(filename))
            content = content.replace(
                img,
                '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content

Example #2

Show file

File: html_creator.py Project: daoli/ZhihuHelp__Python

    def fix_image(self, content):
        """Point the ``<img>`` tags in ``content`` at locally stored images.

        ``content`` is first passed through ``Match.fix_html``.  Every
        distinct ``<img ...>`` tag (after a trailing ``/`` has been dropped)
        is then rewritten exactly once:

        * a tag with no ``src="..."`` attribute, or whose src consists only
          of spaces, merely gets ``</img>`` appended;
        * otherwise the src is handed to ``HtmlCreator.fix_image_src``; a
          truthy result is passed to ``self.image_container.add`` and the
          value it returns is used as the filename (an empty string is used
          when the result is falsy).  The src becomes
          ``../images/<filename>``, ``</img>`` is appended and the tag is
          wrapped in a ``duokan-image-single`` div.

        :param content: HTML text to process.
        :return: the rewritten HTML text.
        """
        content = Match.fix_html(content)
        handled = set()
        for img in re.findall(r'<img[^>]*', content):
            # The pattern stops just before '>'.  Drop the '/' of a
            # self-closing tag, then restore the closing bracket.
            if img[-1] == '/':
                img = img[:-1]
            img += '>'

            # str.replace below rewrites every occurrence of a tag at once.
            # Visiting a repeated tag a second time used to append a second
            # '</img>' to tags without a usable src, so skip repeats.
            if img in handled:
                continue
            handled.add(img)

            found = re.search(r'(?<=src=").*?(?=")', img)
            src = found.group(0) if found else ''
            if src.replace(' ', '') == '':
                # Missing or blank src: only close the tag.
                content = content.replace(img, img + '</img>')
                continue

            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                filename = self.image_container.add(src_download)
            else:
                filename = ''

            new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
            # Hard-coded Zhihu image URL (whitedot.jpg) is redirected to the
            # local file as well.
            new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                          '../images/{}'.format(filename))
            new_image += '</img>'
            content = content.replace(img, '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content

Example #3

Show file

File: html_creator.py Project: zuiwan/ZhihuHelp

    def fix_image(self, content):
        """Rewrite image tags so they reference files under ``../images/``.

        Steps, in order:

        1. ``content`` is passed through ``Match.fix_html``.
        2. Each distinct ``<img ...>`` tag found by the regex is normalised
           (a trailing ``/`` is removed, ``>`` is appended) and processed
           exactly once.
        3. Tags lacking a ``src="..."`` attribute, or whose src is made up
           only of spaces, just receive a closing ``</img>``.
        4. For all other tags the src goes through
           ``HtmlCreator.fix_image_src``; when that result is truthy it is
           given to ``self.image_container.add`` and the return value is the
           filename, otherwise the filename is empty.  The src is swapped
           for ``../images/<filename>``, ``</img>`` is appended and the tag
           is wrapped in a ``duokan-image-single`` div.

        :param content: HTML text to process.
        :return: the rewritten HTML text.
        """
        content = Match.fix_html(content)
        seen_tags = set()
        for img in re.findall(r'<img[^>]*', content):
            # The regex match ends before '>'; strip a self-closing '/' and
            # add the bracket back.
            if img[-1] == '/':
                img = img[:-1]
            img += '>'

            # Each str.replace call below already covers every occurrence of
            # this tag.  Re-processing a repeated tag would stack an extra
            # '</img>' onto src-less tags, so handle each tag only once.
            if img in seen_tags:
                continue
            seen_tags.add(img)

            src_match = re.search(r'(?<=src=").*?(?=")', img)
            src = src_match.group(0) if src_match else ''
            if src.replace(' ', '') == '':
                # Missing or blank src: only close the tag.
                content = content.replace(img, img + '</img>')
                continue

            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                filename = self.image_container.add(src_download)
            else:
                filename = ''

            new_image = img.replace('"{}"'.format(src),
                                    '"../images/{}"'.format(filename))
            # Hard-coded Zhihu image URL (whitedot.jpg) is redirected to the
            # local file as well.
            new_image = new_image.replace(
                '//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                '../images/{}'.format(filename))
            new_image += '</img>'
            content = content.replace(
                img,
                '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content

Example #4

Show file

File: page_worker.py Project: gitter-badger/EE-Book

 def worker(self, target_url):
     """Fetch ``target_url`` and store its cleaned-up content.

     Nothing happens when the URL is already in ``self.work_complete_set``
     or when ``Http.get_content`` returns a falsy value.  Otherwise the
     content is passed through ``Match.fix_html`` (with
     ``recipe_kind='sinablog_author'`` when ``self`` is a
     ``sinablogAuthorWorker``), appended to ``self.content_list``, and the
     URL is added to ``self.work_complete_set``.

     :param target_url: URL to fetch.
     :return: always ``None``.
     """
     # Skip URLs that were already fetched successfully.
     if target_url in self.work_complete_set:
         return

     Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
     raw_page = Http.get_content(target_url)
     if not raw_page:
         return

     from src.worker.sinablog_worker import sinablogAuthorWorker
     fix_arguments = {'content': raw_page}
     if isinstance(self, sinablogAuthorWorker):
         fix_arguments['recipe_kind'] = 'sinablog_author'
     # fix_html has to repair the <br> tags to avoid blowing the stack.
     fixed_page = Match.fix_html(**fix_arguments)

     self.content_list.append(fixed_page)
     Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
     self.work_complete_set.add(target_url)

Example #5

Show file

File: simple_answer.py Project: zuiwan/ZhihuHelp

 def set_dom(self, dom):
     """Reset ``self.info`` and pick the answer's parts out of ``dom``.

     When ``dom`` is falsy, or it contains a ``div.answer-status`` element,
     only ``self.info`` is reset.  Otherwise ``self.header``, ``self.body``
     and ``self.footer`` are looked up in ``dom``; if a body was found, its
     tag content is run through ``Match.fix_html`` and parsed into
     ``self.content``.  Finally ``dom`` is forwarded to
     ``self.author_parser``.

     :param dom: parsed document node exposing ``select`` and ``find``.
     :return: always ``None``.
     """
     self.info = {}
     if not dom or dom.select('div.answer-status'):
         return

     self.header = dom.find('div', class_='zm-item-vote-info')
     self.body = dom.find('textarea', class_='content')
     self.footer = dom.find('div', class_='zm-meta-panel')
     if self.body:
         raw_body = self.get_tag_content(self.body)
         fixed_body = Match.fix_html(raw_body)
         self.content = BeautifulSoup(fixed_body, 'html.parser')
     self.author_parser.set_dom(dom)

Example #6

Show file

File: html_creator.py Project: mozii/EE-Book

    def fix_image(self, content, recipe):
        """Point the ``<img>`` tags in ``content`` at locally stored images.

        ``content`` is first passed through ``Match.fix_html`` together with
        ``recipe``.  Every distinct ``<img ...>`` tag is then rewritten
        exactly once:

        * a tag with no ``src="..."`` attribute, or whose src consists only
          of spaces, merely gets ``</img>`` appended;
        * otherwise the src is handed to ``HtmlCreator.fix_image_src`` and a
          filename is chosen depending on ``recipe`` (see the inline
          comments).  The src becomes ``../images/<filename>``,
          recipe-specific replacements are applied, and the tag is wrapped
          in a ``duokan-image-single`` div.

        :param content: HTML text to process.
        :param recipe: recipe kind, compared against members of ``Type``.
        :return: the rewritten HTML text.
        """
        content = Match.fix_html(content=content, recipe_kind=recipe)
        handled = set()
        for img in re.findall(r'<img[^>]*', content):
            # The pattern stops just before '>'.  Except for the two author
            # recipes listed here, drop the '/' of a self-closing tag.
            if recipe not in [Type.sinablog_author, Type.cnblogs_author]:
                if img[-1] == '/':
                    img = img[:-1]
            img += '>'

            # str.replace below rewrites every occurrence of a tag at once.
            # Visiting a repeated tag a second time used to append a second
            # '</img>' to tags without a usable src, so skip repeats.
            if img in handled:
                continue
            handled.add(img)

            found = re.search(r'(?<=src=").*?(?=")', img)
            src = found.group(0) if found else ''
            if src.replace(' ', '') == '':
                # Missing or blank src: only close the tag.
                content = content.replace(img, img + '</img>')
                continue

            src_download = HtmlCreator.fix_image_src(src)
            if not src_download:
                filename = ''
            elif recipe in Type.zhihu and not src_download.startswith('http'):
                # Fix zhuanlan image href: keep the part before the first
                # '.' and build a pic2.zhimg.com URL from it.
                src_download = src_download.split('.')[0]
                filename = self.image_container.add('https://pic2.zhimg.com/'+src_download+'_b.jpg')
            elif recipe in Type.generic:
                filename = ''    # TODO
            else:
                filename = self.image_container.add(src_download)

            new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))

            if recipe in Type.jianshu:
                # Turn data-original-src into src and rename the old src to
                # falsesrc via a temporary token.  Every remaining 'src'
                # substring is renamed, not only the attribute name.
                # There should be a better way; written like this for now.
                new_image = new_image.replace('data-original-src', 'temppicsr')
                new_image = new_image.replace('src', 'falsesrc')
                new_image = new_image.replace('temppicsr', 'src')
                new_image += '</img>'
            elif recipe in Type.sinablog:
                # Hard-coded Sina image URL; this could be moved into
                # Match.fix_html.
                new_image = new_image.replace(
                    'http://simg.sinajs.cn/blog7style/images/common/sg_trans.gif',
                    '../images/{}'.format(filename))
            elif recipe in Type.zhihu:
                new_image = new_image.replace('//zhstatic.zhihu.com/assets/zhihu/ztext/whitedot.jpg',
                                              '../images/{}'.format(filename))
                new_image += '</img>'
            elif recipe in Type.cnblogs:
                # No extra rewriting for cnblogs.
                pass
            content = content.replace(img, '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content

Example #7

Show file

    def worker(self, target_url):
        """Fetch ``target_url`` and store its cleaned-up content.

        URLs already present in ``self.work_complete_set`` are skipped.  If
        ``Http.get_content`` returns something truthy, it is passed through
        ``Match.fix_html``, appended to ``self.content_list``, and the URL
        is added to ``self.work_complete_set``.

        :param target_url: URL to fetch.
        :return: always ``None``.
        """
        # Skip URLs that were already fetched successfully.
        if target_url in self.work_complete_set:
            return

        Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
        page = Http.get_content(target_url)
        if page:
            # fix_html has to repair the <br> tags to avoid blowing the
            # stack.
            self.content_list.append(Match.fix_html(page))
            Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
            self.work_complete_set.add(target_url)

Example #8

Show file

File: worker.py Project: FengWenPei/ZhihuHelp

    def worker(self, target_url):
        """Download one page and queue its fixed HTML.

        Does nothing for a URL that is already in
        ``self.work_complete_set`` or when ``Http.get_content`` yields a
        falsy value.  On success the HTML returned by ``Match.fix_html`` is
        appended to ``self.content_list`` and the URL is added to
        ``self.work_complete_set``.

        :param target_url: URL to fetch.
        :return: always ``None``.
        """
        already_done = target_url in self.work_complete_set
        if already_done:
            # URLs fetched successfully before are skipped automatically.
            return

        Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
        raw_html = Http.get_content(target_url)
        if not raw_html:
            return

        # The <br> tags need fixing, otherwise the stack may blow up later.
        fixed_html = Match.fix_html(raw_html)
        self.content_list.append(fixed_html)
        Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
        self.work_complete_set.add(target_url)

Example #9

Show file

    def fix_image(self, content):
        """Point the ``<img>`` tags in ``content`` at locally stored images.

        ``content`` is first passed through ``Match.fix_html``.  Every
        distinct ``<img ...>`` tag (after a trailing ``/`` has been dropped)
        is then rewritten exactly once:

        * a tag with no ``src="..."`` attribute, or whose src consists only
          of spaces, merely gets ``</img>`` appended;
        * otherwise the first ``src="..."`` value found (the regex also
          matches the value of ``data-original-src``) is handed to
          ``HtmlCreator.fix_image_src``; a truthy result is passed to
          ``self.image_container.add`` and the value it returns is used as
          the filename (empty string otherwise).  That value becomes
          ``../images/<filename>``, ``data-original-src`` is renamed to
          ``src``, ``</img>`` is appended and the tag is wrapped in a
          ``duokan-image-single`` div.

        :param content: HTML text to process.
        :return: the rewritten HTML text.
        """
        content = Match.fix_html(content)
        handled = set()
        for img in re.findall(r'<img[^>]*', content):
            # The pattern stops just before '>'.  Drop the '/' of a
            # self-closing tag, then restore the closing bracket.
            if img[-1] == '/':
                img = img[:-1]
            img += '>'

            # str.replace below rewrites every occurrence of a tag at once.
            # Visiting a repeated tag a second time used to append a second
            # '</img>' to tags without a usable src, so skip repeats.
            if img in handled:
                continue
            handled.add(img)

            found = re.search(r'(?<=src=").*?(?=")', img)
            src = found.group(0) if found else ''
            if src.replace(' ', '') == '':
                # Missing or blank src: only close the tag.
                content = content.replace(img, img + '</img>')
                continue

            src_download = HtmlCreator.fix_image_src(src)
            if src_download:
                filename = self.image_container.add(src_download)
            else:
                filename = ''

            new_image = img.replace('"{}"'.format(src), '"../images/{}"'.format(filename))
            # Turn data-original-src into src and rename the old src to
            # falsesrc via a temporary token.  Every remaining 'src'
            # substring is renamed, not only the attribute name.
            # There should be a better way; written like this for now.
            new_image = new_image.replace('data-original-src', 'temppicsr')
            new_image = new_image.replace('src', 'falsesrc')
            new_image = new_image.replace('temppicsr', 'src')

            new_image += '</img>'
            content = content.replace(img, '<div class="duokan-image-single">{}</div>'.format(new_image))

        return content