def clean_content(self, html, max_size, force_html, force_ascii):
    """Sanitise downloaded content before it is used further.

    html: the downloaded content to check
    max_size: largest allowed length of html, or None to skip the size check
    force_html: when true, content rejected by common.is_html is discarded
    force_ascii: when true, accepted content is passed through common.to_ascii

    Returns '' when the content is rejected (too big or not HTML), the
    ASCII-converted content when force_ascii is set, and html unchanged
    otherwise.
    """
    # Size check comes first; the length is only measured when a limit is set.
    if max_size is not None:
        size = len(html)
        if size > max_size:
            common.logger.info('Too big: %s' % size)
            return ''  # too big to store

    # Reject anything that is not HTML when the caller insists on HTML.
    if force_html and not common.is_html(html):
        common.logger.info('Not html')
        return ''  # non-html content

    # Only content that passed the checks above is converted.
    if force_ascii:
        return common.to_ascii(html)  # remove non-ascii characters
    return html
def _clean_content(self, html, max_size, force_html, force_ascii): """Clean up downloaded content html: the input to clean max_size: the maximum size of data allowed force_html: content must be HTML force_ascii: content must be ASCII """ if max_size is not None and len(html) > max_size: common.logger.info('Webpage is too big: %s' % len(html)) html = '' # too big to store elif force_html and not common.is_html(html): common.logger.info('Webpage is not html') html = '' # non-html content elif force_ascii: html = common.to_ascii(html) # remove non-ascii characters return html