Example #1
            FILE.close()
        except IOError, error:
            log.warning(unicode(error))
            self.article.content = _(u"Unable to read file: %s.") % path
            return

        page = u'<div>' + unicode(page, "utf8") + u'</div>'
        # FIXME avoid problems with numeric entities in attributes
        page = page.replace(u'&#160;', u'&nbsp;')

        # avoidParserProblems is set to False because BeautifulSoup's
        # cleanup was causing a "concatenating Null+Str" error,
        # and Wikipedia's HTML doesn't need cleaning up.
        # BeautifulSoup is faster this way too.
        soup = BeautifulSoup(page, False)
        content = soup.first('div')

        # remove the wiktionary, wikimedia commons, and categories boxes
        #  and the protected icon and the needs citations box
        if content:
            infoboxes = content.findAll('div',
                                        {'class': 'infobox sisterproject'})
            [infobox.extract() for infobox in infoboxes]
            catboxes = content.findAll('div', {'id': 'catlinks'})
            [catbox.extract() for catbox in catboxes]
            amboxes = content.findAll('table',
                                      {'class': re.compile(r'.*\bambox\b.*')})
            [ambox.extract() for ambox in amboxes]
            protecteds = content.findAll('div', {'id': 'protected-icon'})
            [protected.extract() for protected in protecteds]
        else:
Example #2
            FILE.close()
        except IOError, error:
            log.warning(unicode(error))
            self.article.content = _(u"Unable to read file: %s.") % path
            return

        page = u'<div>' + unicode(page, "utf8") + u'</div>'
        # FIXME avoid problems with numeric entities in attributes
        page = page.replace(u'&#160;', u'&nbsp;')

        # avoidParserProblems is set to False because BeautifulSoup's
        # cleanup was causing a "concatenating Null+Str" error,
        # and Wikipedia's HTML doesn't need cleaning up.
        # BeautifulSoup is faster this way too.
        soup = BeautifulSoup(page, False)
        content = soup.first('div')

        # remove the wiktionary, wikimedia commons, and categories boxes
        #  and the protected icon and the needs citations box
        if content:
            infoboxes = content.findAll('div',
                    {'class': 'infobox sisterproject'})
            [infobox.extract() for infobox in infoboxes]
            catboxes = content.findAll('div', {'id': 'catlinks'})
            [catbox.extract() for catbox in catboxes]
            amboxes = content.findAll('table',
                    {'class': re.compile(r'.*\bambox\b.*')})
            [ambox.extract() for ambox in amboxes]
            protecteds = content.findAll('div', {'id': 'protected-icon'})
            [protected.extract() for protected in protecteds]
        else:
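
Both examples use the old BeautifulSoup 3 style API (`first`, `findAll`, `extract`). For comparison, a minimal sketch of the same cleanup step written against BeautifulSoup 4 might look like the following; the function name, the selector list, and the choice of `html.parser` are illustrative assumptions, not part of either example above.

    # Minimal BeautifulSoup 4 sketch of the same Wikipedia-HTML cleanup
    # (hypothetical names; not taken from the examples above).
    from bs4 import BeautifulSoup

    # CSS selectors for the boxes the examples strip out.
    NOISE_SELECTORS = [
        'div.infobox.sisterproject',   # wiktionary / wikimedia commons boxes
        'div#catlinks',                # category links box
        'table.ambox',                 # article message boxes (e.g. "needs citations")
        'div#protected-icon',          # protection padlock icon
    ]

    def strip_wikipedia_boxes(html):
        # Wrap the fragment in a div so there is a single root to work on,
        # mirroring the u'<div>' + ... + u'</div>' wrapping in the examples.
        soup = BeautifulSoup('<div>' + html + '</div>', 'html.parser')
        content = soup.find('div')
        for selector in NOISE_SELECTORS:
            for tag in content.select(selector):
                tag.decompose()        # drop the tag and everything inside it
        return content.decode_contents()

Note that `table.ambox` matches the literal class token, which is a close but not exact stand-in for the original `\bambox\b` regex, and that `decompose()` discards the removed tags outright, whereas `extract()` returns them; either works here since the examples ignore the return values.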