コード例 #1
0
ファイル: download.py プロジェクト: gotomypc/webscraping
 def clean_content(self, html, max_size, force_html, force_ascii):
     """Filter and normalise downloaded content.

     The checks run in a fixed order and the first one that applies wins:

     1. If ``max_size`` is not None and ``len(html)`` exceeds it, the
        content is discarded and ``''`` is returned.
     2. If ``force_html`` is set and ``common.is_html`` rejects the content,
        it is discarded and ``''`` is returned.
     3. If ``force_ascii`` is set, the result of ``common.to_ascii(html)``
        is returned.

     When none of the above applies, ``html`` is returned unchanged.
     """
     # Size check comes first so oversized payloads are never inspected further.
     if max_size is not None:
         size = len(html)
         if size > max_size:
             common.logger.info('Too big: %s' % size)
             return ''  # too big to store

     # Reject anything that is not HTML when the caller insists on HTML.
     if force_html and not common.is_html(html):
         common.logger.info('Not html')
         return ''  # non-html content

     # Only content that was kept gets the ASCII conversion.
     if force_ascii:
         return common.to_ascii(html)  # remove non-ascii characters

     return html
コード例 #2
0
ファイル: download.py プロジェクト: w4lker/Antix
 def clean_content(self, html, max_size, force_html, force_ascii):
     """Decide whether downloaded content is kept, and in what form.

     Returns ``''`` when the content is longer than ``max_size`` (only
     checked if ``max_size`` is not None) or when ``force_html`` is set and
     ``common.is_html`` does not accept it. Otherwise returns
     ``common.to_ascii(html)`` if ``force_ascii`` is set, or ``html`` itself.
     """
     too_big = max_size is not None and len(html) > max_size
     if too_big:
         # too big to store
         common.logger.info('Too big: %s' % len(html))
         return ''

     if force_html:
         if not common.is_html(html):
             # non-html content
             common.logger.info('Not html')
             return ''

     if force_ascii:
         # remove non-ascii characters
         return common.to_ascii(html)
     return html
コード例 #3
0
ファイル: download.py プロジェクト: amumu/webscraping
    def _clean_content(self, html, max_size, force_html, force_ascii):
        """Clean up downloaded content

        html:
            the input to clean
        max_size:
            the maximum size of data allowed
        force_html:
            content must be HTML
        force_ascii:
            content must be ASCII
        """
        if max_size is not None and len(html) > max_size:
            common.logger.info('Webpage is too big: %s' % len(html))
            html = '' # too big to store
        elif force_html and not common.is_html(html):
            common.logger.info('Webpage is not html')
            html = '' # non-html content
        elif force_ascii:
            html = common.to_ascii(html) # remove non-ascii characters
        return html
コード例 #4
0
    def _clean_content(self, html, max_size, force_html, force_ascii):
        """Clean up downloaded content

        html:
            the input to clean
        max_size:
            the maximum size of data allowed
        force_html:
            content must be HTML
        force_ascii:
            content must be ASCII
        """
        if max_size is not None and len(html) > max_size:
            common.logger.info('Webpage is too big: %s' % len(html))
            html = ''  # too big to store
        elif force_html and not common.is_html(html):
            common.logger.info('Webpage is not html')
            html = ''  # non-html content
        elif force_ascii:
            html = common.to_ascii(html)  # remove non-ascii characters
        return html