            FILE.close()
        except IOError, error:
            log.warning(unicode(error))
            self.article.content = _(u"Unable to read file: %s.") % path
            return
        # wrap the page in a single root <div> so BeautifulSoup yields
        # one top-level element for soup.first('div') below
        page = u'<div>' + unicode(page, "utf8") + u'</div>'
        # FIXME avoid problems with numeric entities in attributes
        page = page.replace(u'&#160;', u'&nbsp;')
        # avoidParserProblems is set to False because BeautifulSoup's
        # cleanup was causing a "concatenating Null+Str" error,
        # and Wikipedia's HTML doesn't need cleaning up.
        # BeautifulSoup is faster this way too.
        soup = BeautifulSoup(page, False)
        content = soup.first('div')
        # remove the Wiktionary, Wikimedia Commons, and categories boxes,
        # the protected icon, and the needs-citations box
        if content:
            infoboxes = content.findAll('div',
                                        {'class': 'infobox sisterproject'})
            for infobox in infoboxes:
                infobox.extract()
            catboxes = content.findAll('div', {'id': 'catlinks'})
            for catbox in catboxes:
                catbox.extract()
            amboxes = content.findAll('table',
                                      {'class': re.compile(r'.*\bambox\b.*')})
            for ambox in amboxes:
                ambox.extract()
            protecteds = content.findAll('div', {'id': 'protected-icon'})
            for protected in protecteds:
                protected.extract()
        else: