Ejemplo n.º 1
0
 def clean_content(self, html, max_size, force_html, force_ascii):
     """Filter downloaded content before it is kept.

     Returns an empty string when ``max_size`` is set and ``html`` is longer
     than it, or when ``force_html`` is set and ``common.is_html`` rejects
     the content. Otherwise returns ``html``, passed through
     ``common.to_ascii`` first when ``force_ascii`` is set.
     """
     # The size limit is checked first; a max_size of None disables it.
     if max_size is not None and len(html) > max_size:
         common.logger.info('Too big: %s' % len(html))
         return ''
     # Content that must be HTML but is not gets discarded.
     if force_html and not common.is_html(html):
         common.logger.info('Not html')
         return ''
     if force_ascii:
         # remove non-ascii characters
         return common.to_ascii(html)
     return html
Ejemplo n.º 2
0
 def clean_content(self, html, max_size, force_html, force_ascii):
     """Decide what part of a downloaded payload is kept.

     The checks run in order and the first one that applies wins:
     content longer than ``max_size`` (when it is not None) becomes ``''``;
     content failing ``common.is_html`` while ``force_html`` is set becomes
     ``''``; with ``force_ascii`` set the result of ``common.to_ascii`` is
     returned. If none apply, ``html`` is returned untouched.
     """
     oversized = max_size is not None and len(html) > max_size
     if oversized:
         # too big to store
         common.logger.info('Too big: %s' % len(html))
         return ''

     if force_html and not common.is_html(html):
         # non-html content
         common.logger.info('Not html')
         return ''

     if not force_ascii:
         return html
     # remove non-ascii characters
     return common.to_ascii(html)
Ejemplo n.º 3
0
    def _clean_content(self, html, max_size, force_html, force_ascii):
        """Filter downloaded content before it is kept.

        html:
            the downloaded content to check
        max_size:
            largest allowed length of html, or None for no limit
        force_html:
            when true, content rejected by common.is_html is dropped
        force_ascii:
            when true, content is passed through common.to_ascii

        Returns '' for content that is too big or fails the HTML check,
        otherwise the (possibly ASCII-converted) content.
        """
        # The size limit is checked first; a max_size of None disables it.
        if max_size is not None and len(html) > max_size:
            common.logger.info('Webpage is too big: %s' % len(html))
            return ''
        # Content that must be HTML but is not gets discarded.
        if force_html and not common.is_html(html):
            common.logger.info('Webpage is not html')
            return ''
        if force_ascii:
            # remove non-ascii characters
            return common.to_ascii(html)
        return html
Ejemplo n.º 4
0
    def _clean_content(self, html, max_size, force_html, force_ascii):
        """Decide what part of a downloaded payload is kept.

        html:
            the downloaded content to check
        max_size:
            upper bound on len(html); None means unlimited
        force_html:
            require the content to pass common.is_html
        force_ascii:
            run the content through common.to_ascii

        The checks run in order (size, HTML, ASCII) and the first one that
        applies determines the result; '' is returned for rejected content.
        """
        oversized = max_size is not None and len(html) > max_size
        if oversized:
            # too big to store
            common.logger.info('Webpage is too big: %s' % len(html))
            return ''

        if force_html and not common.is_html(html):
            # non-html content
            common.logger.info('Webpage is not html')
            return ''

        if not force_ascii:
            return html
        # remove non-ascii characters
        return common.to_ascii(html)