Esempio n. 1
0
    def test_css_scraper_krokozyabry(self):
        '''Scrape a CSS sample whose image URL contains Cyrillic text.

        The sample file ``krokozyabry.css`` is copied into the response
        body; the scraper must then report exactly one inline URL and no
        linked URLs.
        '''
        sample_path = os.path.join(
            os.path.dirname(__file__), 'testing', 'samples', 'krokozyabry.css'
        )
        css_scraper = CSSScraper()
        css_request = Request.new('http://example.com/styles.css')
        css_response = Response('HTTP/1.0', 200, 'OK')

        # The copy runs inside reset_file_offset; presumably that helper
        # restores the body file position afterwards -- confirm in wpull.util.
        with wpull.util.reset_file_offset(css_response.body.content_file), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, css_response.body.content_file)

        result = css_scraper.scrape(css_request, css_response)
        found_inline = result['inline_urls']
        found_linked = result['linked_urls']

        expected_inline = {'http://example.com/Кракозябры.png'}

        self.assertEqual(expected_inline, found_inline)
        self.assertFalse(found_linked)
Esempio n. 2
0
    def test_css_scraper_mojibake(self):
        '''Scrape a CSS sample whose image URL contains Japanese text.

        Loads ``mojibake.css`` into the response body and checks that the
        scraper yields the one expected inline URL and nothing linked.
        '''
        scraper = CSSScraper()
        request = Request.new('http://example.com/styles.css')
        response = Response('HTTP/1.0', 200, 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            samples_dir = os.path.join(
                os.path.dirname(__file__), 'testing', 'samples'
            )
            css_path = os.path.join(samples_dir, 'mojibake.css')

            # Fill the response body with the raw bytes of the sample.
            with open(css_path, 'rb') as source:
                shutil.copyfileobj(source, response.body.content_file)

        scraped = scraper.scrape(request, response)
        inline, linked = scraped['inline_urls'], scraped['linked_urls']

        self.assertEqual({'http://example.com/文字化け.png'}, inline)
        self.assertFalse(linked)
Esempio n. 3
0
    def test_scrape_css_urls(self):
        '''Check URL and import extraction on a block of ``@import`` rules.

        ``scrape_urls`` is expected to yield only targets written with
        ``url(...)``, while ``scrape_imports`` is expected to yield the
        target of every ``@import`` rule, including quoted and bare forms.
        '''
        text = '''
        @import url("fineprint.css") print;
        @import url("bluish.css") projection, tv;
        @import 'custom.css';
        @import url("chrome://communicator/skin/");
        @import "common.css" screen, projection;
        @import url('landscape.css') screen and (orientation:landscape);
        @import url(cool.css);
        @import warm.css;
        '''

        # Only the url(...) forms count as plain URLs.
        self.assertEqual(
            {
                'fineprint.css',
                'bluish.css',
                'chrome://communicator/skin/',
                'landscape.css',
                'cool.css',
            },
            set(CSSScraper.scrape_urls(text))
        )

        # Every @import target counts as an import, whatever its syntax.
        self.assertEqual(
            {
                'fineprint.css',
                'bluish.css',
                'custom.css',
                'chrome://communicator/skin/',
                'common.css',
                'landscape.css',
                'cool.css',
                'warm.css',
            },
            set(CSSScraper.scrape_imports(text))
        )
Esempio n. 4
0
    def test_scrape_css_urls(self):
        '''Verify ``scrape_urls`` and ``scrape_imports`` on ``@import`` rules.

        The fixture mixes ``url(...)`` imports with quoted-string and bare
        imports. The first assertion covers the ``url(...)`` targets only;
        the second covers all eight import targets.
        '''
        text = '''
        @import url("fineprint.css") print;
        @import url("bluish.css") projection, tv;
        @import 'custom.css';
        @import url("chrome://communicator/skin/");
        @import "common.css" screen, projection;
        @import url('landscape.css') screen and (orientation:landscape);
        @import url(cool.css);
        @import warm.css;
        '''

        expected_urls = {
            'fineprint.css',
            'bluish.css',
            'chrome://communicator/skin/',
            'landscape.css',
            'cool.css',
        }
        expected_imports = {
            'fineprint.css',
            'bluish.css',
            'custom.css',
            'chrome://communicator/skin/',
            'common.css',
            'landscape.css',
            'cool.css',
            'warm.css',
        }

        # Duplicates are irrelevant here, so the results are compared as sets.
        scraped_urls = {url for url in CSSScraper.scrape_urls(text)}
        self.assertEqual(expected_urls, scraped_urls)

        scraped_imports = {url for url in CSSScraper.scrape_imports(text)}
        self.assertEqual(expected_imports, scraped_imports)
Esempio n. 5
0
    def convert_by_record(self, url_record):
        '''Convert the links in the file saved for the given URL Record.

        The CSS or HTML converter writes its output to a temporary sibling
        file (``<filename>-new``), which then takes the place of the
        original file.

        Nothing is done when the file does not exist, or when the record
        carries a link type other than ``css`` or ``html``. When the record
        has no link type, ``HTMLScraper`` and then ``CSSScraper`` are asked
        whether they support the file.

        Args:
            url_record: Record providing ``filename``, ``link_type``,
                ``url`` and ``url_info``.

        Raises:
            Exception: If the record has no link type and the file is
                supported by neither scraper.
        '''
        filename = url_record.filename

        if not os.path.exists(filename):
            return

        link_type = url_record.link_type

        if link_type:
            if link_type not in ('css', 'html'):
                return
        else:
            with open(filename, 'rb') as in_file:
                if HTMLScraper.is_supported(file=in_file,
                                            url_info=url_record.url_info):
                    link_type = 'html'
                elif CSSScraper.is_supported(file=in_file,
                                             url_info=url_record.url_info):
                    link_type = 'css'
                else:
                    link_type = None

        # Pick the converter before logging or backing up, so that an
        # unsupported file fails without leaving a stray '.orig' copy.
        if link_type == 'css':
            converter = self._css_converter
        elif link_type == 'html':
            converter = self._html_converter
        else:
            raise Exception('Unknown link type.')

        _logger.info(
            _('Converting links in file ‘{filename}’ (type={type}).')
            .format(filename=filename, type=link_type)
        )

        if self._backup_enabled:
            shutil.copy2(filename, filename + '.orig')

        temp_filename = filename + '-new'

        try:
            converter.convert(filename, temp_filename,
                              base_url=url_record.url)
        except Exception:
            # Best-effort cleanup: do not leave a partially written
            # temporary file behind; the original error is what matters.
            try:
                if os.path.exists(temp_filename):
                    os.remove(temp_filename)
            except OSError:
                pass
            raise

        # Remove first: os.rename refuses to overwrite an existing
        # destination on some platforms (e.g. Windows).
        os.remove(filename)
        os.rename(temp_filename, filename)
Esempio n. 6
0
    def convert_by_record(self, url_record):
        '''Convert the links in the file saved for the given URL Record.

        The matching converter (CSS or HTML) writes to a temporary file
        named ``<filename>-new``; on success that file replaces the
        original.

        The call is a no-op when the file is missing or when the record's
        link type is set to something other than ``css`` or ``html``. If
        the record has no link type, the type is detected by querying
        ``HTMLScraper.is_supported`` and then ``CSSScraper.is_supported``.

        Args:
            url_record: Record providing ``filename``, ``link_type``,
                ``url`` and ``url_info``.

        Raises:
            Exception: If no link type is recorded and neither scraper
                supports the file.
        '''
        filename = url_record.filename

        if not os.path.exists(filename):
            return

        if url_record.link_type:
            if url_record.link_type not in ('css', 'html'):
                return

            link_type = url_record.link_type
        else:
            with open(filename, 'rb') as in_file:
                if HTMLScraper.is_supported(
                        file=in_file, url_info=url_record.url_info):
                    link_type = 'html'
                elif CSSScraper.is_supported(
                        file=in_file, url_info=url_record.url_info):
                    link_type = 'css'
                else:
                    link_type = None

        # Fail before any side effect (log line, '.orig' backup) when the
        # file type could not be determined.
        if link_type not in ('css', 'html'):
            raise Exception('Unknown link type.')

        _logger.info(__(
            _('Converting links in file ‘{filename}’ (type={type}).'),
            filename=filename, type=link_type
        ))

        if self._backup_enabled:
            shutil.copy2(filename, filename + '.orig')

        temp_filename = filename + '-new'

        try:
            if link_type == 'css':
                self._css_converter.convert(
                    filename, temp_filename, base_url=url_record.url)
            else:
                self._html_converter.convert(
                    filename, temp_filename, base_url=url_record.url)
        except Exception:
            # Best-effort removal of a half-written temporary file; the
            # converter's own error is re-raised unchanged.
            try:
                if os.path.exists(temp_filename):
                    os.remove(temp_filename)
            except OSError:
                pass
            raise

        # os.rename may refuse to overwrite an existing destination on
        # some platforms (e.g. Windows), so the original is removed first.
        os.remove(filename)
        os.rename(temp_filename, filename)
Esempio n. 7
0
    def convert_by_record(self, url_record):
        '''Convert, in place, the links of the file belonging to a URL Record.

        The file path is obtained from the path namer using the record's
        URL. Nothing happens if that file does not exist or if the record
        has a link type other than ``css`` or ``html``. Without a recorded
        link type, ``HTMLScraper`` and then ``CSSScraper`` are asked whether
        they support the file.

        The converter receives the same path as both input and output. If
        neither scraper supports the file, the log line and the optional
        ``.orig`` backup are still produced, but no conversion takes place.
        '''
        parsed_url = URLInfo.parse(url_record.url)
        filename = self._path_namer.get_filename(parsed_url)

        if not os.path.exists(filename):
            return

        recorded_type = url_record.link_type

        if recorded_type:
            if recorded_type not in ('css', 'html'):
                return

            link_type = recorded_type
        else:
            # No recorded type: detect it from the file, HTML first.
            link_type = None

            with open(filename, 'rb') as candidate:
                if HTMLScraper.is_supported(
                        candidate, url_info=url_record.url_info):
                    link_type = 'html'
                elif CSSScraper.is_supported(
                        candidate, url_info=url_record.url_info):
                    link_type = 'css'

        message = _('Converting links in file ‘{filename}’ (type={type}).')
        _logger.info(message.format(filename=filename, type=link_type))

        if self._backup_enabled:
            backup_filename = filename + '.orig'
            shutil.copy2(filename, backup_filename)

        if link_type == 'css':
            self._css_converter.convert(
                filename, filename, base_url=url_record.url)
        elif link_type == 'html':
            self._html_converter.convert(
                filename, filename, base_url=url_record.url)
Esempio n. 8
0
    def convert_by_record(self, url_record):
        '''Rewrite the links in the file that corresponds to a URL Record.

        Steps, in order:

        1. Resolve the filename through the path namer; stop if the file
           is absent.
        2. Use the record's link type when present (stopping unless it is
           ``css`` or ``html``); otherwise probe the file with
           ``HTMLScraper`` and then ``CSSScraper``.
        3. Log the conversion and, when backups are enabled, copy the file
           to ``<filename>.orig``.
        4. Run the CSS or HTML converter with the file as both source and
           destination. A file of undetected type is left unconverted.
        '''
        url_info = URLInfo.parse(url_record.url)
        filename = self._path_namer.get_filename(url_info)

        if not os.path.exists(filename):
            return

        if not url_record.link_type:
            with open(filename, 'rb') as probe_file:
                html_supported = HTMLScraper.is_supported(
                    probe_file, url_info=url_record.url_info)

                if html_supported:
                    link_type = 'html'
                elif CSSScraper.is_supported(
                        probe_file, url_info=url_record.url_info):
                    link_type = 'css'
                else:
                    link_type = None
        elif url_record.link_type in ('css', 'html'):
            link_type = url_record.link_type
        else:
            # Recorded type that this converter does not handle.
            return

        log_text = _(
            'Converting links in file ‘{filename}’ (type={type}).'
        ).format(filename=filename, type=link_type)
        _logger.info(log_text)

        if self._backup_enabled:
            shutil.copy2(filename, filename + '.orig')

        convert_options = {'base_url': url_record.url}

        if link_type == 'css':
            self._css_converter.convert(filename, filename, **convert_options)
        elif link_type == 'html':
            self._html_converter.convert(
                filename, filename, **convert_options)