Example #1
    def test_get_response_text(self):
        text = 'Example'
        response = FakedObject()
        # Stub the named method so no real I/O is involved.
        response.read = lambda: text
        result = self.linkExtractor._get_response_text(response,
                                                       func_name='read')
        self.assertTrue(isinstance(result, str))
        self.assertEqual(text, result)
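Every example here leans on a FakedObject test double whose definition is not shown. A minimal sketch of one plausible implementation, assuming it simply stores arbitrary keyword arguments as attributes and tolerates attributes being attached afterwards (the class name comes from usage above; the body is a guess):

    class FakedObject:
        """Test double: any keyword argument becomes an attribute."""

        def __init__(self, **kwargs):
            for name, value in kwargs.items():
                setattr(self, name, value)

With that shape, FakedObject(url='f').url == 'f' and response.read = lambda: text both work, which is all these tests require.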
Example #2
    def test_get_response(self):
        from common_crawler.http import Response
        from lxml.etree import _Element

        # Coroutine standing in for the response's asynchronous text() method.
        async def text():
            return 'HTML'

        response = FakedObject(url=_TARGET_URL,
                               status=200,
                               charset='utf-8',
                               content_type='text/html',
                               content_length=233,
                               reason='OK',
                               headers={'connection': 'keep-alive'})

        response.text = text

        async def work():
            async with AioHttpClient() as client:
                # Convert the faked response into the library's Response object.
                result = await client.get_response(response)

                self.assertTrue(isinstance(result, Response))

                self.assertTrue(isinstance(result.url, str))
                self.assertEqual(_TARGET_URL, result.url)

                self.assertTrue(isinstance(result.text, str))
                self.assertEqual('HTML', result.text)

                self.assertTrue(isinstance(result.status, int))
                self.assertEqual(result.status, 200)

                self.assertTrue(isinstance(result.charset, str))
                self.assertEqual(result.charset, 'utf-8')

                self.assertTrue(isinstance(result.content_type, str))
                self.assertEqual(result.content_type, 'text/html')

                self.assertTrue(isinstance(result.content_length, int))
                self.assertEqual(result.content_length, 233)

                self.assertTrue(isinstance(result.reason, str))
                self.assertEqual(result.reason, 'OK')

                self.assertTrue(isinstance(result.headers, dict))
                self.assertTrue('connection' in result.headers)
                self.assertEqual(result.headers['connection'], 'keep-alive')

                self.assertTrue(isinstance(result.selector, _Element))

        _LOOP.run_until_complete(work())
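_LOOP and _TARGET_URL are module-level fixtures that the snippet does not show; definitions along these lines would make it self-contained (illustrative assumptions, not the original module's values):

    import asyncio

    _TARGET_URL = 'https://www.example.com'  # assumed URL used by the fake response
    _LOOP = asyncio.new_event_loop()         # dedicated loop for driving the async tests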
Example #3
    def test_clean_for_finished_urls(self):
        engine = self._get_default_engine()
        engine.crawler.finished_urls = [FakedObject(url='f'),
                                        FakedObject(url='g'),
                                        None,
                                        FakedObject(url='d'),
                                        None,
                                        FakedObject(url='a')]

        engine._clean_for_finished_urls()
        self.assertEqual(len(engine.crawler.finished_urls), 4)
        self.assertEqual(engine.crawler.finished_urls[0].url, 'a')
        self.assertEqual(engine.crawler.finished_urls[1].url, 'd')
        self.assertEqual(engine.crawler.finished_urls[2].url, 'f')
        self.assertEqual(engine.crawler.finished_urls[3].url, 'g')
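The assertions imply that _clean_for_finished_urls drops None placeholders and sorts the surviving tasks by URL. A sketch consistent with this test (inferred from the assertions, not taken from the engine's source):

    def _clean_for_finished_urls(self):
        # Discard None entries, then order the finished tasks by URL.
        self.crawler.finished_urls = sorted(
            (task for task in self.crawler.finished_urls if task is not None),
            key=lambda task: task.url)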
Example #4
    def test_restrict_css(self):
        css_name_01 = "container"
        css_name_02 = "btn"
        url_01 = "https://www.python.org"
        url_02 = "http://www.google.com"

        html = """
        <html>
            <head></head>
            <body>
                <div id="div01" value="%s" class="%s">div01</div>
                <div id="div02" value="%s" class="%s">div02</div>
                <div id="div03">div03</div>
            </body>
        </html>
        """ % (url_01, css_name_01, url_02, css_name_02)

        linkExtractor = LxmlLinkExtractor(tags=('div', ),
                                          attrs=('value', ),
                                          restrict_css=('.%s' % css_name_01,
                                                        '.%s' % css_name_02))

        response = FakedObject(url="https://www.example.com", text=html)
        links = linkExtractor.extract_links(response)

        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, url_01)
        self.assertEqual(links[0].text, 'div01')
        self.assertEqual(links[1].url, url_02)
        self.assertEqual(links[1].text, 'div02')
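The restrict_css option suggests CSS selectors are translated into XPath before matching, which lxml supports through the cssselect package. A self-contained illustration of that translation step (it mirrors the idea behind restrict_css, not necessarily LxmlLinkExtractor's internals):

    from lxml import etree
    from cssselect import GenericTranslator

    html = '<div class="container" value="https://www.python.org">div01</div>'
    root = etree.HTML(html)
    xpath = GenericTranslator().css_to_xpath('.container')  # class selector -> XPath
    for node in root.xpath(xpath):
        print(node.get('value'), node.text)  # https://www.python.org div01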
Example #5
    def setUp(self):
        self.response = FakedObject(url=_URL,
                                    status=_STATUS,
                                    headers=_HEADERS,
                                    charset=_CHARSET,
                                    content_type=_CONTENT_TYPE,
                                    content_length=_CONTENT_LENGTH,
                                    reason=_REASON)
        self.response.text = _TEXT
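The underscore-prefixed constants are module-level test data not shown in the snippet; values like these would make the fixture concrete (purely illustrative):

    _URL = 'https://www.example.com'
    _STATUS = 200
    _HEADERS = {'connection': 'keep-alive'}
    _CHARSET = 'utf-8'
    _CONTENT_TYPE = 'text/html'
    _CONTENT_LENGTH = 233
    _REASON = 'OK'
    _TEXT = '<html><body></body></html>'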
Example #6
    def add_to_task_queue(self, url):
        # Accept a single URL string or any iterable of URLs.
        if isinstance(url, str):
            url = [url]

        for u in url:
            self.task_queue.put_nowait(FakedObject(
                url=u,
                parsed_data=None,
                exception=None,
                redirect_num=0,
                retries_num=0,
                redirect_url=None,
                response=FakedObject(status=200,
                                     charset='utf-8',
                                     content_type='text/html',
                                     content_length=None,
                                     reason='OK',
                                     headers=None)
            ))
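A brief usage sketch for this helper, assuming it lives on a test object named helper whose task_queue is an asyncio.Queue, as in the other fixtures here:

    helper.add_to_task_queue('https://www.example.com')                           # single URL
    helper.add_to_task_queue(['https://a.example.com', 'https://b.example.com'])  # batch
    assert helper.task_queue.qsize() == 3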
Example #7
    def test_head(self, mocked):
        status = 200
        mocked.return_value = FakedObject(status=status)

        async def work():
            async with AioHttpClient() as client:
                async with client.head(url=_TARGET_URL) as resp:
                    self.assertEqual(status, resp.status)

        _LOOP.run_until_complete(work())
        mocked.assert_called_once_with(url=_TARGET_URL)
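The mocked parameter implies the test method is wrapped in a mock.patch decorator; the patch target below is an assumption about where the HEAD call is dispatched, not the project's actual decorator:

    from unittest import mock

    @mock.patch('aiohttp.ClientSession.head')  # assumed patch target
    def test_head(self, mocked):
        ...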
Example #8
    def setUp(self):
        self.configuration = {
            'name': 'common_crawler',
            'roots': ('http://www.example.com',),
            'deny_domains': (),
            'allow_domains': (),
            'strict': True,
            'follow': True,
            'allowed_rule': (),
            'denied_rule': (),
            'log_level': 2,
            'log_filename': None,
            'log_format': '%(asctime)s:%(levelname)s:%(message)s',
            'log_init_fn': 'common_crawler.engines._init_logging',
            'max_redirect': 10,
            'max_retries': 4,
            'max_tasks': 100,
            'interval': 1
        }

        self.task = FakedObject(url='https://www.google.com',
                                parsed_data=None,
                                exception=None,
                                redirect_num=0,
                                retries_num=0,
                                redirect_url=None,
                                )
        self.task.response = FakedObject(url=self.task.url,
                                         status=200,
                                         charset='utf-8',
                                         content_type='text/html',
                                         content_length=None,
                                         reason='OK',
                                         headers=None,
                                         text='<html><body><a href="/abc"></a></body></html>')

        self.task_queue = asyncio.Queue()
        self.http_client = FakedObject()
        self.parse_link = lambda x: print('Parsing...')
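These collaborators suggest the engine under test is assembled from the configuration plus the queue, client, and parser; the _get_default_engine helper seen in Example #3 might look roughly like this (the engine class name and keyword interface are assumptions based on usage in these tests):

    def _get_default_engine(self):
        # Hypothetical: build an engine from the fixture's configuration and doubles.
        return CrawlEngine(http_client=self.http_client,
                           task_queue=self.task_queue,
                           parse_link=self.parse_link,
                           **self.configuration)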
Example #9
    def setUp(self):
        self.open_path = 'builtins.open'
        url = 'https://www.example.com'

        response = FakedObject(url=url,
                               status=200,
                               charset='utf-8',
                               content_type=None,
                               content_length=None,
                               reason=None,
                               headers=None,
                               text=None)

        self.task = Task(url=url, parsed_data='Text', response=response)
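With self.open_path set to 'builtins.open', the tests in this class presumably patch file output when persisting a task; a typical pattern (illustrative, not one of the original test bodies):

    from unittest import mock

    with mock.patch(self.open_path, mock.mock_open()) as mocked_open:
        dump_task(self.task)       # hypothetical function that writes parsed_data
        mocked_open.assert_called_once()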
Example #10
    def test_crawl_as_redirect(self, mocked):
        mocked.return_value = FakedObject(
            url=_URL,
            status=301,
            headers={'location': 'https://www.python.org'},
            charset=_CHARSET,
            content_type=_CONTENT_TYPE,
            content_length=_CONTENT_LENGTH,
            reason=_REASON,
            text=_TEXT)
        tasks = []  # renamed from `list` to avoid shadowing the built-in

        async def work(crawler):
            async for t in crawler.crawl():
                tasks.append(t)

        crawler = AsyncCrawler(roots=_URL)
        launcher = AsyncCrawlerLauncher(crawler=crawler, work=work)
        launcher.run()
        # A 301 with a Location header is handled as a redirect,
        # so no task is yielded as finished.
        self.assertEqual(0, len(tasks))
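AsyncCrawlerLauncher reads like a small test helper that drives the crawler coroutine to completion; one plausible shape, assumed from its usage here:

    import asyncio

    class AsyncCrawlerLauncher:
        def __init__(self, crawler, work):
            self.crawler = crawler
            self.work = work

        def run(self):
            # Run the async work function against the crawler on a fresh event loop.
            loop = asyncio.new_event_loop()
            try:
                loop.run_until_complete(self.work(self.crawler))
            finally:
                loop.close()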
Example #11
    def setUp(self):
        # Raw strings avoid invalid escape sequences; '/' needs no escaping
        # in a regex, while literal dots do.
        allow = r'http[s]?://www\.google\.com/?\w*'
        deny = r'http[s]?://www\.google\.com/?hello'
        deny_domains = ['www.amazon.com']
        deny_extensions = ['mp3', 'pdf', 'ppt']
        html = '''
            <html>
                <head></head>
                <body>
                    <a href="https://www.google.com/">Google</a>
                    <a href="https://www.google.com/">Google2</a>
                    <a href="/python">Python</a>
                    <a href="/world">World</a>
                </body>
            </html>
        '''

        response = FakedObject(url='https://www.google.com', text=html)
        self.response = response
        self.linkExtractor = LxmlLinkExtractor(allow=allow,
                                               deny=deny,
                                               deny_domains=deny_domains,
                                               deny_extensions=deny_extensions)
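A quick sanity check of what the two patterns accept and reject (an aside for the reader, not part of the original test):

    import re

    allow = re.compile(r'http[s]?://www\.google\.com/?\w*')
    deny = re.compile(r'http[s]?://www\.google\.com/?hello')

    assert allow.match('https://www.google.com/python')
    assert deny.match('https://www.google.com/hello')
    assert not deny.match('https://www.google.com/python')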
Example #12
    def __init__(self, **kwargs):
        # super() without arguments avoids the infinite-recursion trap of
        # super(self.__class__, self) when this class is subclassed.
        super().__init__(**kwargs)
        self.return_val = FakedObject(url='https://www.link_extractor.com')
        self.count = 0
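The return_val and count fields read like part of a fake link extractor that records how often it is invoked; the rest of the class presumably resembles this (a guess based on the two fields, with a method name matching the extractors seen above):

    def extract_links(self, response):
        # Hypothetical: count each call and hand back the canned link object.
        self.count += 1
        return [self.return_val]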