def _Munge(soup, tag, url):
    """Given a string of HTML content, munge it to be more pleasing."""
    # Upstream failure cases hand us a plain string instead of a tag tree;
    # there is nothing to munge, so pass it straight through.
    if isinstance(tag, basestring):
        return tag
    _MungeStripSiteSpecific(tag, url)
    # First round of in-place cleanup passes, applied in this order.
    for munge in (_MungeStripBrsAfterPs, _MungeStripRules, _MungeStripEmpties):
        munge(tag)
    # This pass may swap in a different root node.
    tag = _MungeStripRootContainers(tag)
    _MungeStripLowScored(tag)
    _MungeStripAttrs(tag)
    _FixUrls(tag, url)
    # Second round of in-place presentation passes.
    for munge in (_MungeImages, _MungeHeaderDowngrade, _MungeHyphenate,
                  _MungeNoscript):
        munge(tag)
    # May replace the root outright (e.g. a single embed becomes a link).
    tag = _MungeTransformEmbeds(soup, tag)
    # Serialize the tag, and apply full justification.
    if isinstance(tag, BeautifulSoup.BeautifulStoneSoup):
        # A bare soup has no element to carry a style attribute, so wrap
        # it in a div to have a tag to justify.
        wrapper = BeautifulSoup.Tag(soup, 'div')
        wrapper.insert(0, tag)
        tag = wrapper
    tag['style'] = 'text-align: justify;'
    return unicode(tag)
def _TransformBrsToParagraphsInner(soup, tag):
    """Collapse a run of content ending in a double-<br> into a real <p>.

    `tag` is assumed to be a <br> tag.  If the very next non-whitespace
    sibling is another <br>, everything before `tag` (back to the nearest
    stop tag) is moved into a new <p>, the second <br> is removed, and the
    first <br> is replaced by the new paragraph.  Otherwise, no change.
    """
    # Walk forward from `tag` looking for a second <br>, skipping
    # whitespace-only strings.  Anything else means this is not a
    # double-<br> paragraph break, so bail out without changes.
    next = tag
    while True:
        next = next.nextSibling
        if not next:
            return
        if isinstance(next, BeautifulSoup.Tag):
            if next.name == 'br':
                break  # Found the second <br>; `next` now points at it.
            else:
                return
        elif isinstance(next, BeautifulSoup.NavigableString):
            if not unicode(next).strip():
                continue  # Ignore whitespace between the two <br>s.
            else:
                return
    # Collect the preceding siblings (in document order) that should form
    # the paragraph body, stopping at block-level tags.
    contents = []
    prev = tag
    while True:
        prev = prev.previousSibling
        if not prev:
            break
        if hasattr(prev, 'name') and prev.name in BR_TO_P_STOP_TAGS:
            break
        contents.insert(0, prev)
    # Move the collected nodes into a fresh <p>.  BeautifulSoup's insert()
    # re-parents each node, detaching it from its old position.
    newp = BeautifulSoup.Tag(soup, 'p')
    for i, newtag in enumerate(contents):
        newp.insert(i, newtag)
    # Drop the trailing <br> and put the paragraph where the first one was.
    next.extract()
    tag.replaceWith(newp)
def _MungeTransformEmbeds(soup, root_tag):
    """Replace embedded media under root_tag with clickable placeholder links.

    Each embed is serialized into a data: URI carried on an <a> link whose
    visible content is a fixed placeholder image.  The embed's original
    dimensions are recorded in embed_width/embed_height attributes.

    Returns the (possibly replaced) root tag: if the root itself is an
    embed, the replacement link is returned directly.
    """
    for tag in util.FindEmbeds(root_tag):
        # BUG FIX: measure the embed being replaced, not the root container.
        # Previously this called util.TagSize(root_tag), which gave every
        # embed the root's dimensions (or the 600x400 fallback).
        try:
            w, h = util.TagSize(tag)
        except TypeError:
            # Size could not be determined; assume a typical video size.
            w = 600
            h = 400
        link = BeautifulSoup.Tag(soup, 'a')
        # Serialize the embed into a self-contained page the reader can open.
        link['href'] = 'data:text/html;base64,' + base64.b64encode(
            '<body style="margin:0;">%s</body>' % unicode(tag))
        link['rel'] = 'embedded_media'
        link['embed_width'] = w
        link['embed_height'] = h
        img = BeautifulSoup.Tag(soup, 'img')
        img['src'] = 'http://readability-api.appspot.com/embedded_media.png'
        img['width'] = '128'
        img['height'] = '128'
        link.insert(0, img)
        if tag == root_tag:
            # The whole root is the embed; hand back the link as the new root.
            return link
        tag.replaceWith(link)
    return root_tag
def OEmbedFixup(soup):
    """Replace oEmbed video-loader links with their underlying iframes.

    Finds <a> tags wired to oEmbedManagerVideoLoader, digs the real
    <iframe> out of the surrounding container (directly, or parsed out of
    a fallback <textarea>), strips its query string, and swaps the whole
    container for a plain <div> holding just the iframe.
    """
    oembed_links = soup.findAll(
        'a', {'onclick': re.compile(r'^oEmbedManagerVideoLoader')})
    for oembed_link in oembed_links:
        cont = oembed_link.parent
        embed = cont.find('iframe')
        if not embed:
            # Some variants stash the iframe markup inside a <textarea>.
            ta = cont.find('textarea')
            if not ta:
                # BUG FIX: was `return`, which silently abandoned all
                # remaining oembed links after the first dud container.
                continue
            s = BeautifulSoup.BeautifulSoup(ta.text)
            embed = s.find('iframe')
        # Drop the query string; the loader's parameters aren't needed to
        # render the bare iframe.
        embed['src'] = re.sub(r'\?.*', '', embed['src'])
        div = BeautifulSoup.Tag(soup, 'div')
        div.insert(0, embed)
        cont.replaceWith(div)
def SwfObjectFixup(soup):
    """Replace SWFObject-v1 script embeds with equivalent <embed> tags.

    Matches script text of the form
      new SWFObject(src, name, width, height, version, bgcolor)
    plus any chained .addParam('key', 'value') calls, and swaps the
    enclosing <script> tag for an <embed> carrying the same attributes.
    """
    # SWFObject 1 style.  NOTE: when a `text` filter is given, BeautifulSoup
    # ignores any tag-name argument and returns the matching
    # NavigableStrings themselves, so the original bogus 'script_txt' name
    # filter was dead weight; drop it for clarity.  Behavior is unchanged.
    script_txts = soup.findAll(text=re.compile(r'\bnew SWFObject\b'))
    for script_txt in script_txts:
        m = re.search(r'new\s+SWFObject.*?\((.*)\)', str(script_txt))
        if not m:
            # BUG FIX: constructor arguments not on the matched text (e.g.
            # split across lines); skip instead of crashing on m.group().
            continue
        src, name, width, height, _, bgcolor = [
            x for _, x in re.findall(r"""(['"])(.*?)\1""", m.group(1))
        ]
        embed = BeautifulSoup.Tag(soup, 'embed')
        embed['src'] = src
        embed['name'] = name
        embed['width'] = width
        embed['height'] = height
        embed['bgcolor'] = bgcolor
        # Carry over every .addParam('key', 'value') as an attribute.
        # (Renamed loop variable: previously shadowed `m` above.)
        for param in re.findall(
            r"""\.\s*addParam\s*\(\s*(['"])(.*)\1\s*,\s*(['"])(.*)\3\s*\)""",
            str(script_txt)):
            embed[param[1]] = param[3]
        # The matched string's parent is the <script> tag itself.
        script_txt.parent.replaceWith(embed)
def ExtractFromHtml(url, html):
    """Given a string of HTML, remove nasty bits, score and pick bit to keep.

    Dispatches on the URL to site-specific extractors (reddit, xkcd,
    Google Groups, NYT, plain .txt), falling back to the generic extractor.

    Returns a (soup, body) pair: the parsed document and the chosen
    content node (or text).
    """
    if re.search(r'^http://(www\.)?reddit\.com/.*/comments/', url, re.I):
        # Reddit comment pages: parse only the link "thing" and comment
        # bodies, then prefer the self-text body over the title link.
        strainer = BeautifulSoup.SoupStrainer(
            attrs={'class': re.compile(r'thing.*link|usertext border')})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        body = soup.find(attrs={'class': re.compile(r'\busertext-body\b')})
        if not body:
            body = soup.find('a', attrs={'class': re.compile(r'\btitle\b')})
            body = body and body.text or soup
        return soup, body
    elif re.search(r'^http://(www\.)?xkcd\.com/\d+', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        # The comic is the only image with both alt and title text.
        img = soup.find(alt=True, title=True)
        if not img:
            # BUG FIX: previously crashed with AttributeError on
            # img.parent when no such image exists; fall back to the
            # generic extractor instead.
            return _ExtractFromHtmlGeneric(url, html)
        cont = img.parent.parent
        for tag in cont.findAll(('br', 'div')):
            util.Strip(tag)
        return soup, cont
    elif re.search(r'^http://groups\.google\.com/', url, re.I):
        strainer = BeautifulSoup.SoupStrainer(attrs={'class': 'maincontbox'})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'^http://(www\.)?nytimes\.com/', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        bodies = soup.findAll(attrs={'class': 'articleBody'})
        # BUG FIX: guard on len > 1, not truthiness — a page with exactly
        # one articleBody raised IndexError on bodies[1] below.
        if len(bodies) > 1:
            # Put the first "article body" contents into the second -- for
            # some reason NYT splits the lead-in text into its own "body".
            while bodies[0].contents:
                bodies[1].insert(0, bodies[0].contents[-1])
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'\.txt(\?|$)', url, re.I):
        # Plain text: wrap the whole document in a <pre>.
        soup = BeautifulSoup.BeautifulSoup()
        pre = BeautifulSoup.Tag(soup, 'pre')
        pre.insert(0, BeautifulSoup.NavigableString(html))
        soup.insert(0, pre)
        return soup, soup
    else:
        return _ExtractFromHtmlGeneric(url, html)