def generate_table(summary):
    """Render the test summary as an HTML table (BeautifulSoup 3 API)."""
    soup = BeautifulSoup()
    new_tag_table = Tag(soup, "table")
    new_tag_table["border"] = 1
    new_tag_table["cellspacing"] = 0
    new_tag_table["cellpadding"] = 0
    new_tag_table["bordercolordark"] = "#000000"
    new_tag_table["bordercolorlight"] = "#ffffff"  # assumed intent; the source re-set "cellspacing" to "#ffffff" here
    soup.append(new_tag_table)
    # header row
    new_Tag_tr = Tag(soup, "tr")
    new_Tag_tr["bgcolor"] = "#0072E3"
    new_tag_table.append(new_Tag_tr)
    for i in ["TestSuite", "Passed", "Failed", "Total"]:
        new_Tag_td = Tag(soup, "td")
        new_Tag_td.string = str(i)
        new_Tag_tr.append(new_Tag_td)
    # one row per test suite
    for i in summary:
        new_Tag_tr = Tag(soup, "tr")
        new_tag_table.append(new_Tag_tr)
        for j in i:
            new_Tag_td = Tag(soup, "td")
            new_Tag_td.string = str(j)
            new_Tag_tr.append(new_Tag_td)
    html = soup.prettify()
    print html
    return html
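# Usage sketch (not from the source): `summary` is inferred from the loops
# above to be an iterable of rows, each row an iterable of cells in
# [TestSuite, Passed, Failed, Total] order. Assumes the BS3 imports the
# snippet itself relies on (BeautifulSoup, Tag).
# summary = [["smoke", 10, 0, 10],
#            ["regression", 42, 3, 45]]
# open("report.html", "w").write(generate_table(summary))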
def get_slides(args):
    contents = get_file_contents(args.file)
    soup = BeautifulSoup(markdown(contents))
    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)
    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(args.file)
    head.append(title)
    link = Tag(hsoup, 'link')
    link['rel'] = 'stylesheet'
    link['type'] = 'text/css'
    if args.offline:
        link['href'] = 'default.css'
    else:
        link['href'] = 'http://gdg-xian.github.io/html5slides-markdown/themes/default.css'
    head.append(link)
    script = Tag(hsoup, 'script')
    if args.offline:
        script['src'] = 'html5slides.js'
    else:
        script['src'] = 'http://gdg-xian.github.io/html5slides-markdown/javascripts/html5slides.js'
    head.append(script)
    html.append(head)
    body = Tag(hsoup, 'body')
    body['style'] = 'display:none'
    section = Tag(hsoup, 'section')
    section['class'] = 'slides layout-regular template-default'
    body.append(section)
    # Split the rendered Markdown into one <article> (slide) per <hr>.
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    article = Tag(hsoup, 'article')
    section.append(article)
    for element in elements:
        if element.name == 'hr':
            article = Tag(hsoup, 'article')
            section.append(article)
        else:
            article.append(element)
    html.append(body)
    return prettify(html)
def body_insertion(content, insertion, end=False):
    """Insert an HTML content into the body HTML node"""
    insertion = BeautifulSoup(insertion)
    soup = BeautifulSoup(content)
    # Append or prepend to <body> when present, otherwise to the document root.
    if soup.body and end:
        soup.body.append(insertion)
    elif soup.body:
        soup.body.insert(0, insertion)
    elif not soup.body and end:
        soup.append(insertion)
    elif not soup.body:
        soup.insert(0, insertion)
    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
def body_insertion(content, insertion, end=False):
    """Insert an HTML content into the body HTML node"""
    if not content.startswith('<body'):
        content = u'<body>%s</body>' % smart_text(content)
    soup = BeautifulSoup(content)
    insertion = BeautifulSoup(insertion)
    if end:
        soup.append(insertion)
    else:
        soup.body.insert(0, insertion)
    if USE_PRETTIFY:
        text = soup.prettify()
    else:
        text = soup.renderContents()
    text = smart_text(text)
    if USE_PREMAILER:
        # Inline the CSS for email clients, resolving links against the current site.
        site = Site.objects.get_current()
        return premailer.transform(smart_text(text),
                                   base_url='http://%s' % site.domain)
    else:
        return text
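# Hedged usage sketch for either body_insertion variant above: prepend a
# banner to a message body. USE_PRETTIFY/USE_PREMAILER (and, in the second
# variant, the Django smart_text/Site helpers and premailer) are module-level
# dependencies of the source; the strings here are illustrative only.
# content = u'<body><p>Hello</p></body>'
# banner = u'<div class="banner">Monthly newsletter</div>'
# html = body_insertion(content, banner)            # inserted at top of <body>
# html = body_insertion(content, banner, end=True)  # appended at the end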
def read_connections():
    if _connections:
        # Parse the saved connections file.
        f = open(_connections, 'r')
        connections = BeautifulSoup(f.read())
        f.close()
    else:
        connections = []
    # Append any VPNs configured directly in the settings.
    # NB: when a connections file exists, these dicts are appended onto the
    # soup rather than a plain list.
    if _settings['vpn1id'] and len(_settings['vpn1id']) > 0:
        vpn = {'id': _settings['vpn1id'],
               'host': _settings['vpn1host'],
               'port': _settings['vpn1port'],
               'proto': proto_enum_to_string(_settings['vpn1proto']),
               'cipher': _settings['vpn1cipher'],
               'delay': _settings['vpn1delay']}
        connections.append(vpn)
    if _settings['vpn2id'] and len(_settings['vpn2id']) > 0:
        vpn = {'id': _settings['vpn2id'],
               'host': _settings['vpn2host'],
               'port': _settings['vpn2port'],
               'proto': proto_enum_to_string(_settings['vpn2proto']),
               'cipher': _settings['vpn2cipher'],
               'delay': _settings['vpn2delay']}
        connections.append(vpn)
    return connections
def wikisnip(url):
    html = wget(url)
    soup = BeautifulSoup(html)
    div = soup.find('div', {'id': 'bodyContent'})
    snip = BeautifulSoup('')
    for node in div.childGenerator():
        # Skip bare strings, tables, scripts, and MediaWiki chrome.
        if (isinstance(node, basestring) or
                node.name.lower() in ["table", "script"] or
                node.get('id') in ["siteSub", "contentSub", "jump-to-nav"] or
                node.get('class') in ['dablink', 'toclimit-2']):
            continue
        # Stop at the first section heading: keep only the lead section.
        if node.name.lower() == "h2":
            break
        snip.append(node)
    # Make relative links absolute.
    for a in snip.findAll('a'):
        if a.get('href'):
            a['href'] = urlparse.urljoin(url, a['href'])
    return snip
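# Illustrative call (assumes the module's wget() fetch helper plus the BS3 and
# urlparse imports the snippet relies on). The URL is an example, not from the
# source; the result is a soup holding only the article's lead section, with
# relative links rewritten to absolute ones.
# snip = wikisnip('http://en.wikipedia.org/wiki/Python_(programming_language)')
# print snip.renderContents()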
def crear_cuerpo_mail(tb):
    # Build the HTML e-mail body: one heading and one link list per source.
    cuerpo_mail = BeautifulSoup()
    html = Tag(cuerpo_mail, "html")
    cuerpo_mail.append(html)
    cuerpo_mail.append("Enlaces:<br>\n")
    for fuente in sorted(tb):
        h3 = Tag(cuerpo_mail, "h3")
        cuerpo_mail.append(h3)
        h3.append(fuente)
        ul = Tag(cuerpo_mail, "ul")
        cuerpo_mail.append(ul)
        lista_titulos_fuente = tb[fuente]
        for i in lista_titulos_fuente:
            dict_enlace = i
            li = Tag(cuerpo_mail, "li")
            li.append(dict_enlace['titulo'].decode('utf-8') +
                      " - <a href='" + dict_enlace['url'].decode('utf-8') + "'>link</a>")
            cuerpo_mail.append(li)
    return str(cuerpo_mail)
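# Shape of the `tb` argument, inferred from the loops above (the source does
# not document it): a dict mapping a source name to a list of dicts with
# UTF-8 byte-string 'titulo' and 'url' keys. Sample values are hypothetical.
# tb = {'ExampleSource': [{'titulo': 'A headline',
#                          'url': 'http://example.com/a'}]}
# body = crear_cuerpo_mail(tb)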
def parse(self):
    soup = BeautifulSoup(self.content)
    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)
    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(self.title)
    head.append(title)
    # deck.js core, theme, highlight.js and transition stylesheets.
    link1 = Tag(hsoup, 'link')
    link1['rel'] = 'stylesheet'
    link1['type'] = 'text/css'
    link1['href'] = 'http://imakewebthings.com/deck.js/core/deck.core.css'
    head.append(link1)
    link2 = Tag(hsoup, 'link')
    link2['rel'] = 'stylesheet'
    link2['type'] = 'text/css'
    link2['href'] = 'http://imakewebthings.com/deck.js/themes/style/swiss.css'
    head.append(link2)
    link3 = Tag(hsoup, 'link')
    link3['rel'] = 'stylesheet'
    link3['type'] = 'text/css'
    link3['href'] = 'http://yandex.st/highlightjs/7.3/styles/monokai_sublime.min.css'
    head.append(link3)
    link4 = Tag(hsoup, 'link')
    link4['rel'] = 'stylesheet'
    link4['type'] = 'text/css'
    link4['href'] = 'http://imakewebthings.com/deck.js/themes/transition/fade.css'
    head.append(link4)
    # jQuery, deck.js core, highlight.js, and the inline bootstrap script.
    script1 = Tag(hsoup, 'script')
    script1['src'] = 'http://imakewebthings.com/deck.js/jquery-1.7.min.js'
    head.append(script1)
    script2 = Tag(hsoup, 'script')
    script2['src'] = 'http://imakewebthings.com/deck.js/core/deck.core.js'
    head.append(script2)
    script3 = Tag(hsoup, 'script')
    script3['src'] = 'http://yandex.st/highlightjs/7.3/highlight.min.js'
    head.append(script3)
    script4 = Tag(hsoup, 'script')
    script4['type'] = 'text/javascript'
    script4.setString(DECK_JS)
    head.append(script4)
    html.append(head)
    body = Tag(hsoup, 'body')
    body['class'] = 'deck-container'
    # Split the content into one <section class="slide"> per <hr>.
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    section = Tag(hsoup, 'section')
    section['class'] = 'slide'
    body.append(section)
    for element in elements:
        if element.name == 'hr':
            section = Tag(hsoup, 'section')
            section['class'] = 'slide'
            body.append(section)
        else:
            section.append(element)
    html.append(body)
    self.html_content = html
def mexhelpextract(mexnames):
    #print 'processing mex files: ' + mexnames.__repr__()
    from ConfigParser import RawConfigParser as ConfigParser, Error as error
    for mexname in mexnames:
        # ConfigParser for the three elements per subfunction written to tmpdir
        # [SubFunction]
        #   usage: 'xyz'
        #   help: 'xyz'
        #   seealso: 'xyz'
        config = ConfigParser({'usage': [], 'help': [], 'seealso': []})
        # assemble command line for matlab
        matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \
            (_tmpdir,
             os.path.splitext(os.path.basename(_mexscript))[0],
             mexname,
             _tmpdir)
        cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd
        # and execute matlab w/ the temporary script we wrote earlier
        try:
            print 'running MATLAB for %s in %s' % (mexname, _tmpdir)
            stdin, stderr = os.popen4(cmd)
            print stderr.read()
            stdin.close()
            stderr.close()
        except:
            print 'could not dump help for %s into %s' % (mexname, _tmpdir)
        cfgfile = config.read(os.path.join(_tmpdir, mexname))
        if cfgfile == []:
            print "skipping " + mexname + " (no output)"
            continue
        subfunctions = config.sections()
        print 'processing subfunctions: ' + subfunctions.__repr__()
        for subfunction in subfunctions:
            # read in the strings for this subfunction
            usage = config.get(subfunction, 'usage')
            help = config.get(subfunction, 'help')
            seealso = config.get(subfunction, 'seealso')
            headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n'
            breadcrumb = "==[[Psychtoolbox]] › [[" \
                + mexname + "]].{mex*,dll} subfunction==\n\n"
            # scrub the text for main text only
            body = beackern(help)
            docstring = '' \
                + '%%(matlab;Usage)' \
                + usage \
                + '%%\n' \
                + body \
                + '\n\n'
            if seealso:
                docstring = docstring + '<<=====See also:=====\n' + seealso + '<<'
            text = '""' + headline \
                + breadcrumb \
                + docstring + '""'
            # retrieve old body text, to update or concatenate with synonymous subfunctions
            #
            # browse the page
            title = re.sub(r"[^\w]|_", "", subfunction)
            try:
                resp = mech.open(baseurl + title + "/edit")
            except HTTPError, e:
                sys.exit("retrieving old text during posting of this mex function failed: %d: %s"
                         % (e.code, e.msg))
            # get text from the edit form
            mech.select_form(nr=1)
            try:
                oldbody = mech["body"]
            except:
                print 'No id="body" form. Figure this out first. cf. page text above.'
                for form in mech.forms():
                    print form
                sys.exit("retrieving old body text failed while processing page: "
                         + baseurl + title + '/edit')
            # parse embedded structuring HTML tags in the wiki text
            soup = BeautifulSoup(oldbody)
            # check if the subfunction is already present, by CSS 'class' and 'id'
            subfct = soup.find('div', {'class': "subfct", 'id': mexname})
            if subfct:
                # replace the text of the container DIV
                subfct.contents[0].replaceWith(text)
            else:
                # construct new DIV to hold the text
                subfctDIV = Tag(soup, "div")
                subfctDIV['class'] = 'subfct'
                subfctDIV['id'] = mexname
                subfctDIV.insert(0, NavigableString(text))
                # insert the new div
                soup.insert(len(soup), subfctDIV)
            # Now scoop the good well-formed divs out of the soup
            divs = soup('div', {'class': "subfct"})
            # and drop them into fresh yummy cheese soup
            cheesesoup = BeautifulSoup()
            # drop good divs into the soup, one by one
            for div in divs:
                # remove the unneeded style attribute, we finally
                # have this stuff defined in the ptbdocs.css now.
                del div['style']
                # escape the HTML tags for wiki parser
                cheesesoup.append(NavigableString('\n""'))
                cheesesoup.append(div)
                cheesesoup.append(NavigableString('""\n'))
            post(subfunction, cheesesoup.renderContents())
def Extract(html, context):
    """ handle express articles, including blogs """
    art = context
    # cheesiness - kill everything from comments onward..
    cullpats = [
        re.compile("<a name=\"comments\">.*", re.DOTALL),
        # when comments disabled, it just shows a message
        re.compile(r"""<img src="http://images[.]\w+[.]co[.]uk/img/comments/nocomments[.](gif|png)".*""", re.DOTALL),
    ]
    for cullpat in cullpats:
        html = cullpat.sub("", html)
    # express claims to be iso-8859-1, but it seems to be windows-1252 really
    soup = BeautifulSoup(html, fromEncoding='windows-1252')
    wrapdiv = soup.find('div', {'class': 'articleWrapper'})
    if wrapdiv is None:
        # for blogs(?)
        wrapdiv = soup.find('td', {'class': 'contentcontainer'})
    missing = soup.find('p', text=u"The article you are looking for does not exist. It may have been deleted.")
    if missing:
        if 'title' in art:
            ukmedia.DBUG2("IGNORE missing article '%s' (%s)\n" % (art['title'], art['srcurl']))
        else:
            ukmedia.DBUG2("IGNORE missing article (%s)\n" % (art['srcurl']))
        return None
    headline = wrapdiv.find('h1', {'class': 'articleHeading'})
    art['title'] = headline.renderContents(None)
    art['title'] = ukmedia.FromHTML(art['title'])
    if art['title'].upper() == art['title']:
        art['title'] = ukmedia.UncapsTitle(art['title'])  # don't like ALL CAPS HEADLINES!
    introcopypara = wrapdiv.find('p', {'class': re.compile(r'\bintrocopy\b')})
    art['description'] = ukmedia.FromHTMLOneLine(introcopypara.renderContents(None))
    datepara = wrapdiv.find('p', {'class': 'date'})
    if datepara is None:
        # "<span class="date">Monday October 27 2008 <b> byEmily Garnham for express.co.uk</b>"
        datespan = wrapdiv.find('span', {'class': 'date'})
        bylineb = datespan.find('b')
        if bylineb is not None:
            art['byline'] = ukmedia.FromHTMLOneLine(bylineb.renderContents(None).strip())
            art['byline'] = re.sub('([bB]y)([A-Z])', r'\1 \2', art['byline'])
            bylineb.extract()
        else:
            if 'blog' in art['srcurl']:
                # blogs(?) have slightly different date/byline layout
                bylineb = wrapdiv.b
                art['byline'] = ukmedia.FromHTMLOneLine(bylineb.renderContents(None).strip())
            else:
                art['byline'] = u''
        art['pubdate'] = ukmedia.ParseDateTime(datespan.renderContents(None).strip())
        datespan.extract()
    else:
        art['pubdate'] = ukmedia.ParseDateTime(datepara.renderContents(None).strip())
        bylineh4 = wrapdiv.find('h4')
        if bylineh4:
            art['byline'] = ukmedia.FromHTML(bylineh4.renderContents(None))
        else:
            # for some sections, try extracting a journo from the description...
            # (Express usually has names IN ALL CAPS, which the byline-o-matic
            # misses, so we'll turn anything likely-looking into titlecase
            # first).
            art['byline'] = u''
            if art['srcurl'].find('/travel/') != -1 or art['srcurl'].find('/motoring/') != -1:
                desc = ukmedia.DecapNames(art['description'])
                art['byline'] = ukmedia.ExtractAuthorFromParagraph(desc)
    # comments
    art['commentlinks'] = []
    comment_cnt_pat = re.compile(r"Have your say\s*[(](\d+)[)]")
    num_comments = None
    comment_url = None
    for marker in soup.findAll(text=comment_cnt_pat):
        if marker.parent.name != 'a':
            continue
        m = comment_cnt_pat.search(marker)
        if m:
            num_comments = int(m.group(1))
            comment_url = urlparse.urljoin(art['srcurl'], '#comments')
            art['commentlinks'].append({'num_comments': num_comments, 'comment_url': comment_url})
            break  # just the one.
    # images
    art['images'] = []
    for imgdiv in soup.findAll('div', {'class': 'articleFirstImage'}):
        img = imgdiv.find('img')
        im = {'url': img['src'].strip(), 'caption': u'', 'credit': u''}
        if im['url'].endswith("/missingimage.gif"):
            continue
        # find caption para
        # eg class="articleFirstImageCaption"
        capp = imgdiv.find('p', {'class': re.compile('caption$', re.IGNORECASE)})
        if capp:
            im['caption'] = ukmedia.FromHTMLOneLine(capp.renderContents(None)).strip()
        art['images'].append(im)
    # cruft removal - mismatched tags means that cruft can get drawn into
    # story paragraphs... sigh...
    # cruft = wrapdiv.find('a', {'name':'comments'} )
    # if cruft:
    #     # delete _everything_ from the comments onward
    #     n = cruft.next
    #     cruft.extract()
    #     cruft = n
    for cruft in wrapdiv.findAll('object'):
        cruft.extract()
    for cruft in wrapdiv.findAll('div', {'class': 'right'}):
        cruft.extract()
    for cruft in wrapdiv.findAll('form'):  # (search form etc)
        cruft.extract()
    for cruft_url_pat in (re.compile("/creditadvice$"), re.compile("/money$")):
        for cruft in wrapdiv.findAll('a', href=cruft_url_pat):
            cruft.extract()
    # OK to build up text body now!
    textpart = BeautifulSoup()
    textpart.insert(len(textpart.contents), introcopypara)
    # for para in wrapdiv.findAll( 'p', ): #{'class':'storycopy'} ):
    # sigh... sometimes express articles have nested paras, without the
    # "storycopy" class. probably due to cutting and pasting from another
    # source...
    for p in wrapdiv.findAll('p', {'class': 'storycopy'}):
        p.extract()
        textpart.append(p)
    content = textpart.prettify(None)
    content = ukmedia.DescapeHTML(content)
    content = ukmedia.SanitiseHTML(content)
    art['content'] = content
    if art['description'] == u'':
        art['description'] = ukmedia.FirstPara(content)
    return art
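# Sketch of how a scraper presumably drives Extract (inferred from the context
# keys read above, not shown in the source): the context dict carries at least
# 'srcurl', and optionally 'title' from the feed; a None return means the page
# reported the article as missing.
# art = Extract(page_html, {'srcurl': url, 'title': feed_title})
# if art is not None:
#     print art['title'], art['pubdate'], len(art['content'])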
def get_article_text(self, soup):
    orig_html = soup.renderContents()
    body = soup.body
    if not body:
        raise ReadabilityException()
    if self.FLAG_STRIP_UNLIKELYS in self.flags:
        for node in body.findAll(True, attrs={"id": UNLIKELY_CANDIDATES}):
            if (node.get("class") and OK_MAYBE_ITS_A_CANDIDATE.search(node["class"])) or \
                    OK_MAYBE_ITS_A_CANDIDATE.search(node["id"]):
                continue
            node.extract()
        for node in body.findAll(True, attrs={"class": UNLIKELY_CANDIDATES}):
            if OK_MAYBE_ITS_A_CANDIDATE.search(node["class"]) or \
                    (node.get("id") and OK_MAYBE_ITS_A_CANDIDATE.search(node["id"])):
                continue
            node.extract()
    # Replace div with paragraphs
    for div in body.findAll("div"):
        if not len(div.findAll(DIV_TO_P_ELEMENTS)):
            div.name = "p"
    # Alternation grouped so only these exact tag names match.
    nodes_to_score = body.findAll(re.compile("^(p|td|pre)$", re.I))
    # Loop through all paragraphs, and assign a score to them based on how content-y they look.
    # Then add their score to their parent node.
    #
    # A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
    candidates = []
    for node in nodes_to_score:
        parent = node.parent
        if not parent:
            continue
        grand_parent = parent and parent.parent
        inner_text = self.get_inner_text(node)
        # If this paragraph is less than 25 characters, don't even count it.
        if len(inner_text) < 25:
            continue
        # Initialize readability data for the parent.
        if not getattr(parent, "readability", None):
            self.initialize_node(parent)
            candidates.append(parent)
        # Initialize readability data for the grandparent.
        if grand_parent and not getattr(grand_parent, "readability", None):
            self.initialize_node(grand_parent)
            candidates.append(grand_parent)
        content_score = 0
        # Add a point for the paragraph itself as a base.
        content_score += 1
        # Add points for any commas within this paragraph
        content_score += inner_text.count(",")
        # For every 100 characters in this paragraph, add another point. Up to 3 points.
        content_score += min(len(inner_text) / 100, 3)
        # Add the score to the parent. The grandparent gets half.
        parent.readability["content_score"] += content_score
        if grand_parent:
            grand_parent.readability["content_score"] += content_score / 2
    # After we've calculated scores, loop through all of the possible candidate nodes we found
    # and find the one with the highest score.
    top_candidate = None
    for candidate in candidates:
        # Scale the final candidates score based on link density. Good content should have a
        # relatively small link density (5% or less) and be mostly unaffected by this operation.
        candidate.readability["content_score"] = candidate.readability["content_score"] * \
            (1 - self.get_link_density(candidate))
        if not top_candidate or top_candidate.readability["content_score"] < candidate.readability["content_score"]:
            top_candidate = candidate
    # If we still have no top candidate, just use the body as a last resort.
    # We also have to copy the body node so it is something we can modify.
    if not top_candidate:
        top_candidate = Tag(soup, "div")
        for c in body.contents[:]:  # iterate a copy: extract() mutates the list
            top_candidate.append(c.extract())
        body.append(top_candidate)
        self.initialize_node(top_candidate)
    article = BeautifulSoup("<div></div>").div
    # Now that we have the top candidate, look through its siblings for content that might also be related.
    # Things like preambles, content split by ads that we removed, etc.
    sibling_score_threshold = max(10, top_candidate.readability["content_score"] * 0.2)
    top_candidate_class = top_candidate.get("class", None)
    for sibling_node in top_candidate.parent:
        if not isinstance(sibling_node, Tag):
            continue
        append = False
        if sibling_node == top_candidate:
            append = True
        else:
            content_bonus = 0
            # Give a bonus if sibling nodes and top candidates have the exact same classname
            if top_candidate_class and top_candidate_class == sibling_node.get("class", None):
                content_bonus += top_candidate.readability["content_score"] * 0.2
            if getattr(sibling_node, "readability", None) and \
                    (sibling_node.readability["content_score"] + content_bonus) >= sibling_score_threshold:
                append = True
            if sibling_node.name == "p":
                link_density = self.get_link_density(sibling_node)
                node_content = self.get_inner_text(sibling_node)
                node_length = len(node_content)
                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length <= 80 and link_density == 0 and re.search(r"\.( |$)", node_content):
                    append = True
        if append:
            if sibling_node.name not in ("div", "p"):
                # We have a node that isn't a common block level element, like a form or td tag.
                # Turn it into a div so it doesn't get filtered out later by accident.
                sibling_node.name = "div"
            # To ensure a node does not interfere with readability styles, remove its classnames.
            # (.get avoids a KeyError on tags without a class attribute.)
            if sibling_node.get("class"):
                del sibling_node["class"]
            article.append(sibling_node)
    if len(article.renderContents()) < 250:
        # Not enough content survived; relax one flag and retry from the original HTML.
        soup = BeautifulSoup(orig_html)
        if self.FLAG_STRIP_UNLIKELYS in self.flags:
            self.flags.remove(self.FLAG_STRIP_UNLIKELYS)
            return self.get_article_text(soup)
        elif self.FLAG_WEIGHT_CLASSES in self.flags:
            self.flags.remove(self.FLAG_WEIGHT_CLASSES)
            return self.get_article_text(soup)
        elif self.FLAG_CLEAN_CONDITIONALLY in self.flags:
            self.flags.remove(self.FLAG_CLEAN_CONDITIONALLY)
            return self.get_article_text(soup)
        else:
            raise ReadabilityException()
    self.prepare_article(article)
    return article.renderContents(encoding=None)
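# Hedged driver sketch: the class name and constructor below are hypothetical
# stand-ins for whatever the surrounding module defines. It illustrates the
# retry design above -- get_article_text() re-parses the original HTML with
# one flag fewer each time until at least 250 bytes of article survive, and
# raises ReadabilityException once all flags are exhausted.
# r = Readability(flags=set([Readability.FLAG_STRIP_UNLIKELYS,
#                            Readability.FLAG_WEIGHT_CLASSES,
#                            Readability.FLAG_CLEAN_CONDITIONALLY]))
# text = r.get_article_text(BeautifulSoup(raw_html))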