Example #1
def proc_hr(fig, img):
  if MATERIAL_HR:
    hr = bs4.Tag(name="hr")
    fig.replace_with(hr)
  else:
    url = "https:" + img["data-src"]
    name = "dividers/{}.png".format(url.split("/")[-1])
    if not os.path.exists(path(name)):
      print("下载分割线", url)
      download(url, name)
    hr = bs4.Tag(name="img", attrs={"src": url, "hr": None})
    fig.replace_with(hr)
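A side note on the construction used above: bs4.Tag(name="hr") builds a detached tag with no builder attached, which works but is more often written with soup.new_tag(). A minimal sketch of the replace_with step, using hypothetical markup:

import bs4

soup = bs4.BeautifulSoup('<figure><img data-src="//cdn.example.com/divider.png"/></figure>',
                         "html.parser")
fig = soup.find("figure")
hr = soup.new_tag("hr")      # new_tag() attaches soup's builder, unlike a bare bs4.Tag(name=...)
fig.replace_with(hr)
print(soup)                  # <hr/>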
Example #2
 def initEmptyMetalink(self):
     self.xml = bs4.BeautifulSoup('<?xml version="1.0" encoding="utf-8"?>',
                                  "xml")
     self.metalink = bs4.Tag(name="metalink")
     self.metalink["xmlns"] = "urn:ietf:params:xml:ns:metalink"
     generator = bs4.Tag(name="generator")
     generator.append("downloaders python library")
     self.metalink.append("\n")
     self.metalink.append(generator)
     self.metalink.append("\n")
     self.xml.append(self.metalink)
     return self.xml
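The same skeleton can be built with soup.new_tag(), which avoids detached Tag objects entirely. A rough sketch, assuming lxml is installed for the "xml" builder (the surrounding class is not shown here):

import bs4

xml = bs4.BeautifulSoup('<?xml version="1.0" encoding="utf-8"?>', "xml")
metalink = xml.new_tag("metalink", xmlns="urn:ietf:params:xml:ns:metalink")
generator = xml.new_tag("generator")
generator.append("downloaders python library")
metalink.append(generator)
xml.append(metalink)
print(xml.prettify())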
Example #3
def parse_content(content):
    soup = bs4.BeautifulSoup(content._content, "html.parser")

    for def_list in soup.find_all("dl"):
        defns = []
        for def_title in def_list.find_all("dt"):
            if def_title.text not in Definitions.exclude:
                anchor_name = make_anchor(def_title)
                anchor_tag = bs4.Tag(name="a", attrs={"name": anchor_name})
                index = def_list.parent.index(def_list) - 1
                def_list.parent.insert(index, anchor_tag)

                defns.append(
                    {
                        "title": make_title(def_title),
                        "definition": make_def(def_title),
                        "anchor": anchor_name,
                        "source": content,
                    }
                )

        for defn in defns:
            defn["see_also"] = [d for d in defns if d is not defn]

        Definitions.definitions += defns

    content._content = str(soup)
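One detail worth flagging: parent.insert(def_list.parent.index(def_list) - 1, anchor_tag) puts the anchor one slot before the <dl>'s own position, which only lands directly in front of the list when a whitespace text node precedes it. When that offset is not wanted, insert_before() is the simpler call. A minimal sketch with hypothetical markup:

import bs4

soup = bs4.BeautifulSoup("<body><p>intro</p><dl><dt>Term</dt><dd>Definition</dd></dl></body>",
                         "html.parser")
def_list = soup.find("dl")
anchor_tag = soup.new_tag("a", attrs={"name": "term"})
def_list.insert_before(anchor_tag)   # lands immediately before the <dl>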
Example #4
    def insertRow(self, index=None):
        # `index' specifies the position of the row to insert (starting at 0). The value
        # -1 can also be used; it results in the new row being inserted at the last
        # position. This parameter is required in Firefox and Opera, but optional in
        # Internet Explorer, Chrome and Safari. If this parameter is omitted, insertRow()
        # inserts a new row at the last position in IE and at the first position in
        # Chrome and Safari.
        if index is None:
            if log.ThugOpts.Personality.isIE():
                index = -1
            if log.ThugOpts.Personality.isChrome() or log.ThugOpts.Personality.isSafari():
                index = 0

        row = HTMLTableRowElement(self.doc,
                                  BeautifulSoup.Tag(self.doc, name='tr'))

        if index in (
                -1,
                len(self._rows),
        ):
            self.rows.nodes.append(row)
        else:
            self.rows.nodes.insert(index, row)

        return row
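The index handling above mirrors the browser quirks spelled out in the comment; stripped of Thug's wrapper classes, row insertion is plain Tag.insert() on the table. A small sketch with hypothetical markup:

import bs4

soup = bs4.BeautifulSoup("<table><tr><td>existing</td></tr></table>", "html.parser")
table = soup.find("table")
row = soup.new_tag("tr")
cell = soup.new_tag("td")
cell.append("inserted")
row.append(cell)
table.insert(0, row)    # index 0 matches the Chrome/Safari default; table.append(row) matches -1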
Example #5
    def __collectTextElements(self):
        """Return all elements containing parts of chapter text (which may be
        <p>aragraphs, <div>isions or plain text nodes) under a single root."""
        starter = self._document.find('div', {'itemprop': 'articleBody'})
        if starter is None:
            # FIXME: This will occur if the method is called more than once.
            # The reason is elements appended to `root' are removed from the document.
            # BS 4.4 implements cloning via `copy.copy()', but supporting it for BS 4.3
            # would be error-prone (due to relying on BS internals) and is not needed.
            if self._textElement:
                _logger.debug(
                    u"You may not call this function more than once!")
            raise ParsingError(u'Failed to locate text.')
        collection = [starter]
        for element in starter.childGenerator():
            if element is None:
                break
            collection.append(element)
        root = bs4.Tag(name='td')
        for element in collection:
            root.append(element)

        if self._configuration['excludeEditorSignature']:
            root = self._excludeEditorSignature(root)

        return root
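The FIXME above comes from the fact that Tag.append() moves an element out of its original tree rather than copying it; on bs4 >= 4.4, copy.copy() of a tag yields a detached clone and leaves the source document intact. A minimal sketch of the difference, with hypothetical markup:

import copy
import bs4

soup = bs4.BeautifulSoup('<div itemprop="articleBody"><p>chapter text</p></div>', "html.parser")
starter = soup.find('div', {'itemprop': 'articleBody'})
root = soup.new_tag('td')
root.append(copy.copy(starter))   # append a clone; the original stays in `soup`
assert soup.find('div', {'itemprop': 'articleBody'}) is not None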
Example #6
    def createCaption(self):
        if self._caption:
            return self._caption

        self._caption = HTMLTableCaptionElement(
            self.doc, BeautifulSoup.Tag(self.doc, name='caption'))
        return self._caption
Example #7
    def createTHead(self):
        if self._tHead:
            return self._tHead

        self._tHead = HTMLTableSectionElement(self.doc, BeautifulSoup.Tag(self.doc, name = 'thead'))
        self.rows.nodes.insert(0, self._tHead)
        return self._tHead
Example #8
def parse_content(content):
    soup = bs4.BeautifulSoup(content._content, 'html.parser')

    for def_list in soup.find_all('dl'):
        defns = []
        for def_title in def_list.find_all('dt'):
            if def_title.text not in Definitions.exclude:
                anchor_name = make_anchor(def_title)
                anchor_tag = bs4.Tag(name="a", attrs={'name': anchor_name})
                index = def_list.parent.index(def_list) - 1
                def_list.parent.insert(index, anchor_tag)

                defns.append({
                    'title': make_title(def_title),
                    'definition': make_def(def_title),
                    'anchor': anchor_name,
                    'source': content
                })

        for defn in defns:
            defn['see_also'] = [d for d in defns if d is not defn]

        Definitions.definitions += defns

    content._content = str(soup)
Example #9
    def createTFoot(self):
        if self._tFoot:
            return self._tFoot

        self._tFoot = HTMLTableSectionElement(self.doc, BeautifulSoup.Tag(self.doc, name = 'tfoot'))
        self.rows.nodes.append(self._tFoot)
        return self._tFoot
Example #10
    def insertRow(self, index = None):
        # Insert a new empty row in the table. The new row is inserted immediately before
        # and in the same section as the current indexth row in the table. If index is -1
        # or equal to the number of rows, the new row is appended. In addition, when the
        # table is empty the row is inserted into a TBODY which is created and inserted
        # into the table.

        # `index' specifies the position of the row to insert (starting at 0). The value
        # -1 can also be used; it results in the new row being inserted at the last
        # position. This parameter is required in Firefox and Opera, but optional in
        # Internet Explorer, Chrome and Safari. If this parameter is omitted, insertRow()
        # inserts a new row at the last position in IE and at the first position in
        # Chrome and Safari.
        if index is None:
            if log.ThugOpts.Personality.isIE():
                index = -1
            if log.ThugOpts.Personality.isChrome() or log.ThugOpts.Personality.isSafari():
                index = 0

        # PLEASE REVIEW ME!
        if not len(self.tBodies):
            tBody = HTMLTableSectionElement(self.doc, BeautifulSoup.Tag(self.doc, name = 'tbody'))
            self.tBodies.nodes.append(tBody)
            if self.tFoot is None:
                self.rows.nodes.append(tBody)
            else:
                self.rows.nodes.insert(-2, tBody)
        else:
            tBody = self.tBodies[-1]

        row = tBody.insertRow(index)
        return row
Example #11
File: w3c.py Project: sumsung007/xspider
    def setter(self, value):
        tag = self.doc

        for part in parts:
            if part == '':
                continue
            elif part == 'text()':
                if tag.string:
                    tag.contents[0] = bs4.NavigableString(value)
                else:
                    tag.append(value)

                tag.string = tag.contents[0]

                return
            else:
                child = tag.find(part)

                if not child:
                    child = bs4.Tag(self.doc, part)

                    tag.append(child)

                tag = child

        tag.append(value)
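Note that bs4.Tag(self.doc, part) passes part as the second positional argument, which in bs4's Tag(parser, builder, name, ...) signature fills the builder slot rather than the tag name; the keyword form used in several other examples here avoids that ambiguity. A minimal sketch of creating and attaching a missing child by keyword:

import bs4

doc = bs4.BeautifulSoup("<root></root>", "xml")   # the "xml" builder requires lxml
tag = doc.find("root")
child = doc.new_tag("chapter")                    # or bs4.Tag(name="chapter") for a detached tag
tag.append(child)
child.append("value")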
Example #12
    async def get_information(cls, href):
        """
        Gets information for the given search result.
        """
        url = base_cppr + href
        conn = await cls.acquire_http()
        response = await conn.get(url)
        # Make soup.
        bs = bs4.BeautifulSoup(await response.text())

        header = bs.find(name="tr", attrs={"class": "t-dsc-header"})
        if header:
            header = header.text
        else:
            header = ""

        taster_tbl: bs4.Tag = bs.find(name="table", attrs={"class": "t-dcl-begin"})

        if taster_tbl:
            tasters = taster_tbl.find_all(
                name="span",
                attrs={"class": lambda c: c is not None and "mw-geshi" in c},
            )

            if tasters:
                # Fixes some formatting
                for i, taster in enumerate(tasters):
                    taster = taster.text.split("\n")
                    taster = "\n".join(t.rstrip() for t in taster)
                    taster = taster.replace("\n\n", "\n")
                    tasters[i] = taster

            # Remove tasters from DOM
            taster_tbl.replace_with(bs4.Tag(name="empty"))
        else:
            tasters = []

        h1 = bs.find(name="h1").text

        # Get the description
        desc = bs.find(name="div", attrs={"id": "mw-content-text"})

        if desc:
            # first_par_node = desc.find(name='p')
            # description = first_par_node.text + '\n'
            # sibs = first_par_node.find_next_siblings()
            # for sib in sibs:
            #    description += sib.text + '\n'
            description = "\n".join(
                p.text
                for p in desc.find_all(name="p")
                if not p.text.strip().endswith(":")
                and not p.text.strip().startswith("(")
                and not p.text.strip().endswith(")")
            )
        else:
            description = ""

        return url, h1, tasters, header, description
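Two small observations on the scrape above: the soup is built without an explicit parser, which recent bs4 versions flag with a GuessedAtParserWarning, and the taster table is removed by swapping in a placeholder <empty> tag, where Tag.decompose() would drop the subtree without leaving anything behind. A minimal sketch of the latter, with hypothetical markup:

import bs4

bs = bs4.BeautifulSoup('<div><table class="t-dcl-begin"><tr><td>code</td></tr></table></div>',
                       "html.parser")
taster_tbl = bs.find(name="table", attrs={"class": "t-dcl-begin"})
taster_tbl.decompose()    # removes the table and its children outright
print(bs)                 # <div></div>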
Example #13
    def createElement(self, tagname):
        # bs4.Tag(builder = None)
        element = DOMImplementation.createHTMLElement(self, bs4.Tag(self.doc, None, tagname))

        if self.onCreateElement:
            self.onCreateElement(element)

        return element
Example #14
File: w3c.py Project: sumsung007/xspider
    def createElement(self, tagname):
        element = DOMImplementation.createHTMLElement(
            self, bs4.Tag(parent=None, name=tagname))

        if self.onCreateElement:
            self.onCreateElement(element)

        return element
Example #15
    def __init__(self, doc, parent, attr):
        self.doc = doc
        self.parent = parent
        self.attr = attr
        self.tag = BeautifulSoup.Tag(parser=self.doc, name='attr')
        Node.__init__(self, doc)

        self._value = self.getValue()
Example #16
    def generateMetalinkFileNodeFromTarget(self, target):
        fileName = target.fsPath.name
        file = self.metalink.select_one("file", name=fileName)

        if file is None:
            file = bs4.Tag(name="file")
            file["name"] = target.fsPath.name
        self.metalink.append("\n")
        self.metalink.append(file)
        self.metalink.append("\n")

        for uri in target.uris:
            url = bs4.Tag(name="url")
            url.append(uri)
            file.append("\n")
            file.append(url)
        file.append("\n")
        return file
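select_one() expects a CSS selector, so filtering on the name attribute is normally written into the selector itself rather than passed as a keyword argument. A small sketch, assuming an already populated metalink document:

import bs4

metalink = bs4.BeautifulSoup('<metalink><file name="disk.iso"/></metalink>', "xml")
existing = metalink.select_one('file[name="disk.iso"]')   # attribute filter inside the selector
print(existing)   # <file name="disk.iso"/>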
Example #17
    def __init__(self, doc, parent, attr):
        self.doc    = doc
        self.parent = parent
        self.attr   = attr
        self.tag    = bs4.Tag(parser = self.doc, name = 'attr')
        Node.__init__(self, doc)

        self._specified = False
        self._value     = self.getValue()
Example #18
def fixMetalink(meta4Text: str) -> "bs4.BeautifulSoup":
    """This function is licensed under Unlicense license"""

    meta4XML = bs4.BeautifulSoup(meta4Text, "xml")
    fEl = meta4XML.select_one("file")
    urisEls = list(fEl.select("url"))
    for u in urisEls:
        u.string = fixHTTPS(u.string)
    if not fEl.select("metaurl[mediatype=torrent]"):
        t = bs4.Tag(name="metaurl")
        t.attrs["mediatype"] = "torrent"
        t.string = uris["torrent"]
        urisEls[0].insert_before(t)

    magnetUri = ourGet(uris["magnet"]).text.strip()

    t = bs4.Tag(name="url")
    t.attrs["priority"] = "0"
    t.string = magnetUri
    urisEls[0].insert_before(t)
    return meta4XML
Example #19
    def createElement(self, tagname, tagvalue = None):
        from .DOMImplementation import DOMImplementation

        if log.ThugOpts.features_logging:
            log.ThugLogging.Features.increase_createelement_count()

        # Internet Explorer 8 and below also support the syntax
        # document.createElement('<P>')
        if log.ThugOpts.Personality.isIE() and log.ThugOpts.Personality.browserMajorVersion < 9:
            if tagname.startswith('<') and '>' in tagname:
                tagname = tagname[1:].split('>')[0]

        return DOMImplementation.createHTMLElement(self, BeautifulSoup.Tag(parser = self.doc, name = tagname))
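The IE-only branch normalises the legacy document.createElement('<P>') spelling down to a bare tag name before the Tag is built; standalone, that normalisation is just string slicing:

tagname = '<P>'
if tagname.startswith('<') and '>' in tagname:
    tagname = tagname[1:].split('>')[0]
assert tagname == 'P'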
Example #20
    def createElement(self, tagname, tagvalue = None):
        from DOMImplementation import DOMImplementation

        # Internet Explorer 8 and below also support the syntax
        # document.createElement('<P>')
        if log.ThugOpts.Personality.isIE() and log.ThugOpts.Personality.browserVersion < '9.0':
            if tagname.startswith('<') and '>' in tagname:
                tagname = tagname[1:].split('>')[0]

        element = DOMImplementation.createHTMLElement(self, BeautifulSoup.Tag(parser = self.doc, name = tagname))
        if self.onCreateElement:
            self.onCreateElement(element)
        
        return element
Example #21
def embedImages():
    if debug:
        shutil.copy('output.html', 'outputOLD.html')
    file1 = open('output.html', 'rb')
    con = file1.read()
    file1.close()
    try:
        bs = BeautifulSoup.BeautifulSoup(con.decode('utf8'), features="lxml")
    except UnicodeDecodeError:
        bs = BeautifulSoup.BeautifulSoup(con,
                                         fromEncoding='windows-1252',
                                         features="lxml")
    if bs.find('meta', {'http-equiv': 'Content-Type'}) is None:
        bs.find('head').insert(
            1,
            BeautifulSoup.Tag(parser=bs,
                              name='meta',
                              attrs={
                                  'http-equiv': 'Content-Type',
                                  'content': 'text/html; charset=utf-8'
                              }))
    tagsTemp = bs.findAll('img')
    tags = []
    for x in tagsTemp:
        h = x.get('src')
        if h is not None:
            if len(h) > 3:
                if h[0:4] == 'cid:':
                    tags.append(x)
    for x in tags:
        src = x['src']
        with open(src[4:], 'rb') as emb:
            stream = emb.read()
        data = 'data:image;base64,' + base64.b64encode(stream).decode('utf8')
        x['src'] = data
        if not debug:
            os.remove(src[4:])
    con = bs.prettify()
    file1 = open('output.html', 'wb')
    file1.write(con.encode('utf8'))
    file1.close()
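The meta-charset insertion is the step most likely to trip people up, because attributes such as http-equiv are not valid Python keywords; passing an attrs dict (or building the Tag explicitly, as the example does) handles that. A small sketch with hypothetical markup, assuming lxml is installed:

import bs4

bs = bs4.BeautifulSoup('<html><head><title>t</title></head><body></body></html>', features="lxml")
if bs.find('meta', {'http-equiv': 'Content-Type'}) is None:
    meta = bs.new_tag('meta', attrs={'http-equiv': 'Content-Type',
                                     'content': 'text/html; charset=utf-8'})
    bs.find('head').insert(0, meta)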
Example #22
def append_to(parent, tag, **kwargs):
    """
    Append an element to the supplied parent.

    :param parent: Parent to append to.
    :param tag: Tag to create.
    :param kwargs: Tag kwargs.
    :return: New element.
    """
    if hasattr(parent, "soup"):
        soup = parent.soup
    else:
        soup = parent.find_parent("html")

    # Create Tag explicitly instead of using new_tag, otherwise attribute "name" leads to clash with tag-name in bs4
    new_tag = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)

    new_tag.soup = soup

    parent.append(new_tag)

    return new_tag
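The comment in append_to points at a real quirk: soup.new_tag('input', name='q') collides with new_tag's own name parameter, so an attribute literally called name has to go through attrs= or through a hand-built Tag as above. A minimal sketch of the attrs= route:

import bs4

soup = bs4.BeautifulSoup("<html><body></body></html>", "html.parser")
field = soup.new_tag("input", attrs={"name": "q", "type": "text"})   # avoids the name clash
soup.body.append(field)
print(soup.body)   # <body><input name="q" type="text"/></body>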
Example #23
    def createElement(self, tagname, tagvalue=None):
        # zomo
        import re
        match = re.search('iframe', str(tagname).lower())
        if match:
            log.ThugLogging.add_behavior_warn(
                "[iframe injection: createElement] %s" % str(tagname))

        from DOMImplementation import DOMImplementation

        # Internet Explorer 8 and below also support the syntax
        # document.createElement('<P>')
        if log.ThugOpts.Personality.isIE() and log.ThugOpts.Personality.browserVersion < '9.0':
            if tagname.startswith('<') and '>' in tagname:
                tagname = tagname[1:].split('>')[0]

        element = DOMImplementation.createHTMLElement(
            self, BeautifulSoup.Tag(parser=self.doc, name=tagname))
        if self.onCreateElement:
            self.onCreateElement(element)

        return element
Example #24
    def insertRow(self, index=None):
        # Insert a new empty row in the table. The new row is inserted immediately before
        # and in the same section as the current indexth row in the table. If index is -1
        # or equal to the number of rows, the new row is appended. In addition, when the
        # table is empty the row is inserted into a TBODY which is created and inserted
        # into the table.

        # `index' specifies the position of the row to insert (starting at 0). The value
        # -1 can also be used; it results in the new row being inserted at the last
        # position. This parameter is required in Firefox and Opera, but optional in
        # Internet Explorer, Chrome and Safari. If this parameter is omitted, insertRow()
        # inserts a new row at the last position in IE and at the first position in
        # Chrome and Safari.
        if index is None:
            if log.ThugOpts.Personality.isIE():
                index = -1
            if log.ThugOpts.Personality.isChrome() or log.ThugOpts.Personality.isSafari():
                index = 0

        row = HTMLTableRowElement(self.doc, bs4.Tag(self.doc, name='tr'))
        self.rows.nodes.insert(index, row)
        return row
Example #25
 def __init__(self, doc):
     self.tag = BeautifulSoup.Tag(parser = doc, name = 'documentfragment')
     Node.__init__(self, doc)
     self.__init_personality()
Example #26
File: _dict.py Project: mozii/pyglossary
def format_clean_content(title, body, BeautifulSoup):
    # heavily integrated with output of dsl reader plugin!
    # and with xdxf also.
    """
    :param title: str | None
    """

    # class="sec" => d:priority="2"
    # style="color:steelblue" => class="ex"
    # class="p" style="color:green" => class="p"
    # style="color:green" => class="c"
    # style="margin-left:{}em" => class="m{}"
    # <s> => <del>

    # xhtml is strict
    if BeautifulSoup:
        soup = BeautifulSoup.BeautifulSoup(body, "lxml", from_encoding='utf-8')
        # difference between 'lxml' and 'html.parser'
        if soup.body:
            soup = soup.body

        for tag in soup(class_='sec'):
            tag['class'].remove('sec')
            if not tag['class']:
                del tag['class']
            tag['d:priority'] = "2"
        for tag in soup(lambda x: 'color:steelblue' in x.get('style', '')):
            remove_style(tag, 'color:steelblue')
            if 'ex' not in tag.get('class', []):
                tag['class'] = tag.get('class', []) + ['ex']
        for tag in soup(is_green):
            remove_style(tag, 'color:green')
            if 'p' not in tag.get('class', ''):
                tag['class'] = tag.get('class', []) + ['c']
        for tag in soup(True):
            if 'style' in tag.attrs:
                m = margin_re.search(tag['style'])
                if m:
                    remove_style(tag, m.group(0))
                    tag['class'] = tag.get('class', []) + ['m' + m.group(1)]
        for tag in soup.select('[href]'):
            href = tag['href']
            if not (href.startswith('http:') or href.startswith('https:')):
                tag['href'] = 'x-dictionary:d:%s' % href
        for tag in soup('u'):
            tag.name = 'span'
            tag['class'] = tag.get('class', []) + ['u']
        for tag in soup('s'):
            tag.name = 'del'

        if title:
            h1 = BeautifulSoup.Tag(name='h1')
            h1.string = title
            soup.insert(0, h1)
        # hence the name BeautifulSoup
        content = toStr(soup.encode_contents())
    else:
        # somewhat analogous to what BeautifulSoup is supposed to do
        body = em0_9_re.sub(em0_9_sub, body)
        body = em0_9_ex_re.sub(em0_9_ex_sub, body)
        body = href_re.sub(href_sub, body)

        body = body \
            .replace('<i style="color:green">', '<i class="c">') \
            .replace('<i class="p" style="color:green">', '<i class="p">') \
            .replace('<span class="ex" style="color:steelblue">', '<span class="ex">') \
            .replace('<span class="sec ex" style="color:steelblue">', '<span class="sec ex">') \
            .replace('<u>', '<span class="u">').replace('</u>', '</span>') \
            .replace('<s>', '<del>').replace('</s>', '</del>')

        # nice header to display
        content = '<h1>%s</h1>%s' % (title, body) if title else body
        content = close_tag.sub(r'<\g<1> />', content)
        content = img_tag.sub(r'<img \g<1>/>', content)
    content = content.replace('&nbsp;', '&#160;')
    content = nonprintable.sub('', content)
    return content
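The style-to-class rewrites above lean on two bs4 behaviours that are easy to miss: tag.name is writable, so an element can be renamed in place, and tag['class'] is a plain list that can be extended. A small sketch of the <u>/<s> conversions on a tiny input:

import bs4

soup = bs4.BeautifulSoup('<p><u>underlined</u> and <s>struck</s></p>', 'html.parser')
for tag in soup('u'):
    tag.name = 'span'                                  # rename the element in place
    tag['class'] = tag.get('class', []) + ['u']
for tag in soup('s'):
    tag.name = 'del'
print(soup)   # <p><span class="u">underlined</span> and <del>struck</del></p>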
Example #27
 def createElement(self, tagname, tagvalue=None):
     return DOMImplementation.createHTMLElement(
         self, bs4.Tag(parser=self.doc, name=tagname))
Example #28
    def to_html(self, *, assets):
        if self._html is not None:
            return self._html

        asset_by_id = {_['id']: _ for _ in assets}

        def _assetName(id_):
            return asset_by_id[id_]['name']

        html = bs4.BeautifulSoup('', 'lxml')
        d = {}

        def _add(e0, e1):
            parent1 = html
            _e = e0
            while _e is not None:
                if id(_e) in d:
                    parent1 = d[id(_e)]
                    break
                _e = _e.parent

            if (parent1 is html) and (not _is_tag(e0)):
                return

            if _is_tag(e0):
                d[id(e0)] = e1
            parent1.append(e1)

        tr = Traversal(self._root)
        for e0 in tr:
            if isinstance(e0, bs4.NavigableString):
                _li = str(e0).split('$$')
                hasMath = False
                for _ in _li:
                    if not hasMath:
                        _add(e0, _)
                    else:
                        _span = bs4.Tag(name='span')
                        _span['hasMath'] = 'true'
                        _span.append(_)
                        _add(e0, _span)

                    hasMath = not hasMath

                continue

            if not _is_tag(e0):
                continue

            if e0.name == 'asset':
                assert _has_no_child(e0)

                e1 = bs4.Tag(name='p')
                e1['class'] = 'asset'
                e1.append(_assetName(e0['id']))

            elif e0.name == 'img':
                assert _has_no_child(e0)

                e1 = bs4.Tag(name='img')
                e1['src'] = e1['alt'] = _assetName(e0['assetId'])
                e1['src'] = quote(e1['src'])

            elif e0.name == 'heading':
                e1 = bs4.Tag(name='h%d' % int(e0['level']))

            elif e0.name == 'text':
                e1 = bs4.Tag(name='p')

            elif e0.name == 'list':
                bulletType = e0['bulletType']
                if bulletType == 'numbers':
                    e1 = bs4.Tag(name='ol')
                    e1['type'] = '1'
                elif bulletType == 'bullets':
                    e1 = bs4.Tag(name='ul')
                else:
                    e1 = bs4.Tag(name='ul')
                    logging.warning('[CML] unknown bulletType=%s' % bulletType)

            elif e0.name == 'a':
                e1 = bs4.Tag(name='a')
                e1['href'] = e0['href']
                if e0.get('refid'):
                    e1['refid'] = e0['refid']

            elif e0.name == 'code':
                e1 = bs4.Tag(name='pre')
                e1.append(copy.copy(e0))

                tr.skip_children()

            elif e0.name in [
                    'li', 'strong', 'em', 'u', 'table', 'tr', 'td', 'th',
                    'sup', 'sub'
            ]:
                e1 = bs4.Tag(name=e0.name)

            elif e0.name in ['co-content']:
                continue

            else:
                logging.warning('[CML] unknown e0.name=%s\n%s' % (e0.name, e0))
                continue

            _add(e0, e1)

        self._html = str(html)
        return self._html
Example #29
    async def get_information(self, ctx, href):
        """
        Gets information for the given search result.
        """
        url = base_cppr + href

        async with self.acquire_http_session() as conn:
            with algorithms.TimeIt() as timer:
                async with conn.get(url) as resp:
                    self.logger.info("GET %s", url)
                    resp.raise_for_status()

                    # Make soup.
                    bs = bs4.BeautifulSoup(await resp.text(), features="html.parser")

        await ctx.send(f"Response from server took {timer.time_taken * 1_000:,.2f}ms", delete_after=3)

        header = bs.find(name="tr", attrs={"class": "t-dsc-header"})
        if header:
            header = header.text
        else:
            header = ""

        taster_tbl: bs4.Tag = bs.find(name="table", attrs={"class": "t-dcl-begin"})

        if taster_tbl:
            tasters = taster_tbl.find_all(name="span", attrs={"class": lambda c: c is not None and "mw-geshi" in c})

            if tasters:
                # Fixes some formatting
                for i, taster in enumerate(tasters):
                    taster = taster.text.split("\n")
                    taster = "\n".join(t.rstrip() for t in taster)
                    taster = taster.replace("\n\n", "\n")
                    tasters[i] = taster

            # Remove tasters from DOM
            taster_tbl.replace_with(bs4.Tag(name="empty"))
        else:
            tasters = []

        h1 = bs.find(name="h1").text

        # Get the description
        desc = bs.find(name="div", attrs={"id": "mw-content-text"})

        if desc:
            # first_par_node = desc.find(name='p')
            # description = first_par_node.text + '\n'
            # sibs = first_par_node.find_next_siblings()
            # for sib in sibs:
            #    description += sib.text + '\n'
            description = "\n".join(
                p.text
                for p in desc.find_all(name="p")
                if not p.text.strip().endswith(":")
                and not p.text.strip().startswith("(")
                and not p.text.strip().endswith(")")
            )
        else:
            description = ""

        return url, h1, tasters, header, description
Example #30
def format_clean_content(title, body, BeautifulSoup):
    # heavily integrated with output of dsl reader plugin!
    # and with xdxf also.
    """
	:param title: str | None
	"""

    # class="sec" => d:priority="2"
    # style="color:steelblue" => class="ex"
    # class="p" style="color:green" => class="p"
    # style="color:green" => class="c"
    # style="margin-left:{}em" => class="m{}"
    # <s> => <del>

    # xhtml is strict
    if BeautifulSoup:
        soup = BeautifulSoup.BeautifulSoup(body, "lxml", from_encoding="utf-8")
        # difference between "lxml" and "html.parser"
        if soup.body:
            soup = soup.body

        for tag in soup(class_="sec"):
            tag["class"].remove("sec")
            if not tag["class"]:
                del tag["class"]
            tag["d:priority"] = "2"
        for tag in soup(lambda x: "color:steelblue" in x.get("style", "")):
            remove_style(tag, "color:steelblue")
            if "ex" not in tag.get("class", []):
                tag["class"] = tag.get("class", []) + ["ex"]
        for tag in soup(is_green):
            remove_style(tag, "color:green")
            if "p" not in tag.get("class", ""):
                tag["class"] = tag.get("class", []) + ["c"]
        for tag in soup(True):
            if "style" in tag.attrs:
                m = margin_re.search(tag["style"])
                if m:
                    remove_style(tag, m.group(0))
                    tag["class"] = tag.get("class", []) + ["m" + m.group(1)]
        for tag in soup.select("[href]"):
            href = tag["href"]
            if href.startswith("bword://"):
                href = href[len("bword://"):]
            if not (href.startswith("http:") or href.startswith("https:")):
                tag["href"] = "x-dictionary:d:%s" % href
        for tag in soup("u"):
            tag.name = "span"
            tag["class"] = tag.get("class", []) + ["u"]
        for tag in soup("s"):
            tag.name = "del"

        if title:
            h1 = BeautifulSoup.Tag(name="h1")
            h1.string = title
            soup.insert(0, h1)
        # hence the name BeautifulSoup
        content = toStr(soup.encode_contents())
    else:
        # somewhat analogous to what BeautifulSoup is supposed to do
        body = em0_9_re.sub(em0_9_sub, body)
        body = em0_9_ex_re.sub(em0_9_ex_sub, body)
        body = href_re.sub(href_sub, body)

        body = body \
         .replace('<i style="color:green">', '<i class="c">') \
         .replace('<i class="p" style="color:green">', '<i class="p">') \
         .replace('<span class="ex" style="color:steelblue">', '<span class="ex">') \
         .replace('<span class="sec ex" style="color:steelblue">', '<span class="sec ex">') \
         .replace('<u>', '<span class="u">').replace('</u>', '</span>') \
         .replace('<s>', '<del>').replace('</s>', '</del>')

        # nice header to display
        content = "<h1>%s</h1>%s" % (title, body) if title else body
        content = close_tag.sub(r"<\g<1> />", content)
        content = img_tag.sub(r"<img \g<1>/>", content)
    content = content.replace("&nbsp;", "&#160;")
    content = nonprintable.sub("", content)
    return content