Example #1
0
 def test_grab_parse_defensedxml(self):
     """Querying an XML document with an external entity must raise.

     A document declaring a SYSTEM entity that points at a local file is
     written to disk, fetched through a ``file://`` URL and queried; the
     query is expected to fail with ``EntitiesForbidden``.
     """
     with temp_dir() as workdir:
         secret_path = os.path.join(workdir, 'injection')
         with open(secret_path, 'w') as sink:
             sink.write('Hey there!')
         # Build a file:// URL that is valid on both linux and windows
         url_tail = secret_path.lstrip('/').replace('\\', '/')
         injection_url = 'file:///%s' % url_tail
         payload = ''.join([
             '<!DOCTYPE external [',
             '<!ENTITY ee SYSTEM "' + injection_url + '">',
             ']>',
             '<root>&ee;</root>',
         ]).encode()
         # On windows, forward slashes replace backslashes to avoid:
         # Invalid file://hostname/, expected localhost or 127.0.0.1 or none
         # (str.replace leaves a path without backslashes untouched).
         xml_path = os.path.join(workdir, 'bad.xml').replace('\\', '/')
         with open(xml_path, 'wb') as sink:
             sink.write(payload)
         grab = build_grab(content_type='xml')
         grab.go('file://%s' % xml_path)
         document = grab.doc
         with self.assertRaises(EntitiesForbidden):
             document('//title')
Example #2
0
 def test_transport_option_as_string_fake(self):
     """Use a transport specified as a dotted ``module.Class`` string.

     ``FAKE_TRANSPORT_CODE`` is written to ``foo.py`` inside a temporary
     directory that is put at the front of ``sys.path``; the response
     obtained through ``'foo.FakeTransport'`` is then checked.
     """
     with temp_dir() as dir_:
         sys.path.insert(0, dir_)
         try:
             with open(os.path.join(dir_, 'foo.py'), 'w') as out:
                 out.write(FAKE_TRANSPORT_CODE)
             self.assert_transport_response('foo.FakeTransport', b'XYZ')
         finally:
             # Undo the sys.path change even when the check above fails,
             # so the temporary directory does not stay on the import
             # path for the rest of the test run.
             sys.path.remove(dir_)
    def test_body_inmemory_false(self):
        """Check on-disk body storage enabled by ``body_inmemory=False``.

        Three scenarios are exercised: a request made with only
        ``body_inmemory=False`` set by the test (expected to raise
        ``GrabMisuseError``), storage into ``body_storage_dir`` where a
        repeated request yields a different file path, and storage under
        the fixed name given by ``body_storage_filename``.
        """
        with temp_dir() as tmp_dir:
            grab = build_grab()
            grab.setup(body_inmemory=False)
            with self.assertRaises(GrabMisuseError):
                grab.go(self.server.get_url())

            self.server.response['get.data'] = b'foo'
            grab = build_grab()
            grab.setup(body_inmemory=False)
            grab.setup(body_storage_dir=tmp_dir)
            grab.go(self.server.get_url())
            self.assertTrue(os.path.exists(grab.doc.body_path))
            # assertIn/assertNotEqual/assertIsNone print the compared
            # values on failure instead of a bare "False is not true".
            self.assertIn(tmp_dir, grab.doc.body_path)
            with open(grab.doc.body_path, 'rb') as inp:
                self.assertEqual(b'foo', inp.read())
            # pylint: disable=protected-access
            self.assertIsNone(grab.doc._bytes_body)
            # pylint: enable=protected-access
            old_path = grab.doc.body_path

            grab.go(self.server.get_url())
            self.assertNotEqual(old_path, grab.doc.body_path)

        with temp_dir() as tmp_dir:
            self.server.response['get.data'] = 'foo'
            grab = build_grab()
            grab.setup(body_inmemory=False)
            grab.setup(body_storage_dir=tmp_dir)
            grab.setup(body_storage_filename='music.mp3')
            grab.go(self.server.get_url())
            self.assertTrue(os.path.exists(grab.doc.body_path))
            self.assertIn(tmp_dir, grab.doc.body_path)
            with open(grab.doc.body_path, 'rb') as inp:
                self.assertEqual(b'foo', inp.read())
            self.assertEqual(os.path.join(tmp_dir, 'music.mp3'),
                             grab.doc.body_path)
            self.assertEqual(grab.doc.body, b'foo')
            # pylint: disable=protected-access
            self.assertIsNone(grab.doc._bytes_body)
            # pylint: enable=protected-access
Example #4
0
    def test_body_inmemory_false(self):
        """Verify that ``body_inmemory=False`` keeps the body in a file.

        First a request with only ``body_inmemory=False`` set by the test
        must raise ``GrabMisuseError``. Then the body is stored inside
        ``body_storage_dir`` (a second request must produce a different
        path), and finally under the explicit ``body_storage_filename``.
        """
        with temp_dir() as tmp_dir:
            grab = build_grab()
            grab.setup(body_inmemory=False)
            with self.assertRaises(GrabMisuseError):
                grab.go(self.server.get_url())

            self.server.response['get.data'] = b'foo'
            grab = build_grab()
            grab.setup(body_inmemory=False)
            grab.setup(body_storage_dir=tmp_dir)
            grab.go(self.server.get_url())
            self.assertTrue(os.path.exists(grab.doc.body_path))
            # Dedicated assert methods are used so that a failure shows
            # the actual values rather than "False is not true".
            self.assertIn(tmp_dir, grab.doc.body_path)
            with open(grab.doc.body_path, 'rb') as inp:
                self.assertEqual(b'foo', inp.read())
            # pylint: disable=protected-access
            self.assertIsNone(grab.doc._bytes_body)
            # pylint: enable=protected-access
            old_path = grab.doc.body_path

            grab.go(self.server.get_url())
            self.assertNotEqual(old_path, grab.doc.body_path)

        with temp_dir() as tmp_dir:
            self.server.response['get.data'] = 'foo'
            grab = build_grab()
            grab.setup(body_inmemory=False)
            grab.setup(body_storage_dir=tmp_dir)
            grab.setup(body_storage_filename='music.mp3')
            grab.go(self.server.get_url())
            self.assertTrue(os.path.exists(grab.doc.body_path))
            self.assertIn(tmp_dir, grab.doc.body_path)
            with open(grab.doc.body_path, 'rb') as inp:
                self.assertEqual(b'foo', inp.read())
            self.assertEqual(os.path.join(tmp_dir, 'music.mp3'),
                             grab.doc.body_path)
            self.assertEqual(grab.doc.body, b'foo')
            # pylint: disable=protected-access
            self.assertIsNone(grab.doc._bytes_body)
            # pylint: enable=protected-access
Example #5
0
    def test_save_hash(self):
        """Check that `Response.save_hash` writes the body unchanged."""
        with temp_dir() as workdir:
            with open(IMG_FILE, 'rb') as source:
                expected = source.read()
            self.server.response['get.data'] = expected

            grab = build_grab()
            grab.go(self.server.get_url())
            # The returned path is joined onto the target directory to
            # locate the stored file.
            stored_name = grab.doc.save_hash(self.server.get_url(), workdir)
            stored_path = os.path.join(workdir, stored_name)
            with open(stored_path, 'rb') as stored:
                actual = stored.read()
            self.assertEqual(actual, expected)
Example #6
0
    def test_save(self):
        """Check that `Response.save` writes the body to the given path."""
        with temp_dir() as workdir:
            with open(IMG_FILE, 'rb') as source:
                expected = source.read()
            target = os.path.join(workdir, 'file.bin')
            self.server.response['get.data'] = expected

            grab = build_grab()
            grab.go(self.server.get_url())
            grab.doc.save(target)
            with open(target, 'rb') as saved:
                written = saved.read()
            self.assertEqual(written, expected)
Example #7
0
    def test_log_option(self):
        """The `log_file` option stores the response body in that file."""
        with temp_dir() as workdir:
            reset_request_counter()

            target = os.path.join(workdir, 'lograb.html')
            grab = build_grab()
            grab.setup(log_file=target)
            self.server.response['get.data'] = 'omsk'

            # The directory must stay empty until a request is made
            self.assertEqual(os.listdir(workdir), [])
            grab.go(self.server.get_url())
            self.assertEqual(os.listdir(workdir), ['lograb.html'])
            with open(target) as handle:
                logged = handle.read()
            self.assertEqual(logged, 'omsk')
Example #8
0
    def test_log_option(self):
        """Check that a request writes its body to the `log_file` path."""
        with temp_dir() as scratch:
            reset_request_counter()

            log_path = os.path.join(scratch, 'lograb.html')
            grab = build_grab()
            grab.setup(log_file=log_path)
            self.server.response['get.data'] = 'omsk'

            # Before the request: nothing on disk yet
            self.assertEqual(os.listdir(scratch), [])
            grab.go(self.server.get_url())
            # After the request: exactly the configured log file
            self.assertEqual(os.listdir(scratch), ['lograb.html'])
            with open(log_path) as stream:
                content = stream.read()
            self.assertEqual(content, 'omsk')
Example #9
0
 def test_lxml_security_bug(self):
     """Show that ``parse`` expands an external file entity.

     The text of the ``<root>`` element in the parsed tree is expected to
     be the content of the referenced local file.
     """
     with temp_dir() as workdir:
         secret_path = os.path.join(workdir, 'injection')
         with open(secret_path, 'w') as sink:
             sink.write('Hey there!')
         # Build a file:// URL that is valid on both linux and windows
         url_tail = secret_path.lstrip('/').replace('\\', '/')
         injection_url = 'file:///%s' % url_tail
         payload = ''.join([
             '<!DOCTYPE external [',
             '<!ENTITY ee SYSTEM "' + injection_url + '">',
             ']>',
             '<root>&ee;</root>',
         ]).encode()
         tree = parse(BytesIO(payload))
         root_texts = tree.xpath('//root/text()')
         self.assertEqual(root_texts[0], 'Hey there!')
Example #10
0
    def test_log_dir_response_content(self):
        """Response headers must show up in the ``log_dir`` ``.log`` file."""
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]

            self.assertEqual(os.listdir(tmp_dir), [])
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertIn prints the log content on failure, unlike
            # assertTrue(... in ...) which only says "False is not true".
            self.assertIn('x-engine', log_file_content.lower())
Example #11
0
    def test_log_dir_response_content(self):
        """Check that the ``.log`` file in ``log_dir`` has response headers.

        The server is told to send an ``X-Engine`` header; its name is
        looked up case-insensitively in ``01.log``.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]

            self.assertEqual(os.listdir(tmp_dir), [])
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertIn is used so that a failure displays the log text.
            self.assertIn('x-engine', log_file_content.lower())
Example #12
0
    def test_log_dir_request_content_is_empty(self):
        """Request headers/body must not be logged in this configuration.

        Only ``log_dir`` is set (the test does not pass ``debug``); the
        custom header name and the POST payload must both be absent from
        ``01.log``.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            grab.setup(headers={'X-Name': 'spider'}, post='xxxPost')

            self.assertEqual(os.listdir(tmp_dir), [])
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertNotIn prints the offending log content on failure,
            # unlike assertFalse(... in ...).
            self.assertNotIn('X-Name', log_file_content)
            self.assertNotIn('xxxPost', log_file_content)
Example #13
0
 def test_lxml_security_bug(self):
     """Demonstrate external entity expansion by ``parse``.

     An XML document referencing a local file via a SYSTEM entity is
     parsed; the ``<root>`` text is expected to equal the file content.
     """
     with temp_dir() as scratch:
         target_path = os.path.join(scratch, 'injection')
         with open(target_path, 'w') as writer:
             writer.write('Hey there!')
         # Build a file:// URL that is valid on both linux and windows
         relative_part = target_path.lstrip('/').replace('\\', '/')
         injection_url = 'file:///%s' % relative_part
         entity_decl = '<!ENTITY ee SYSTEM "' + injection_url + '">'
         document_text = (
             '<!DOCTYPE external ['
             + entity_decl
             + ']>'
             + '<root>&ee;</root>'
         )
         tree = parse(BytesIO(document_text.encode()))
         first_text = tree.xpath('//root/text()')[0]
         self.assertEqual(first_text, 'Hey there!')
Example #14
0
    def test_log_dir_request_content_is_empty(self):
        """Check that ``01.log`` contains no request headers or POST data.

        The test configures ``log_dir`` together with a custom header and
        a POST body, without passing ``debug``; neither the header name
        nor the payload may occur in the log file.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            grab.setup(headers={'X-Name': 'spider'}, post='xxxPost')

            self.assertEqual(os.listdir(tmp_dir), [])
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertNotIn shows the log text when the check fails.
            self.assertNotIn('X-Name', log_file_content)
            self.assertNotIn('xxxPost', log_file_content)
Example #15
0
 def test_grab_parse_defensedxml(self):
     """Querying an XML document with an external entity must raise.

     A document declaring a SYSTEM entity that points at a local file is
     written to disk, fetched through a ``file://`` URL and queried; the
     query is expected to fail with ``EntitiesForbidden``.
     """
     with temp_dir() as tmp_dir:
         injection_path = os.path.join(tmp_dir, 'injection')
         with open(injection_path, 'w') as out:
             out.write('Hey there!')
         # Prepare file:// URL valid for both linux and windows
         injection_url = 'file:///%s' % (injection_path.lstrip('/').replace(
             '\\', '/'))
         bad_xml = ('<!DOCTYPE external ['
                    '<!ENTITY ee SYSTEM "' + injection_url + '">'
                    ']>'
                    '<root>&ee;</root>').encode()
         xml_file = os.path.join(tmp_dir, 'bad.xml')
         # On windows, use slashes instead of backslashes to avoid error:
         # Invalid file://hostname/, expected localhost or 127.0.0.1 or none
         # (replace() is a no-op for paths without backslashes).
         xml_file = xml_file.replace('\\', '/')
         with open(xml_file, 'wb') as out:
             out.write(bad_xml)
         grab = build_grab(content_type='xml')
         grab.go('file://%s' % xml_file)
         self.assertRaises(EntitiesForbidden, grab.doc, '//title')
Example #16
0
    def test_log_dir_option(self):
        """Two requests must leave two numbered .html/.log pairs on disk.

        The first response body is 'omsk1' (served once), the second is
        'omsk2'; each must end up in the matching ``NN.html`` file.
        """
        with temp_dir() as workdir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=workdir)
            self.server.response_once['get.data'] = 'omsk1'
            self.server.response['get.data'] = 'omsk2'

            self.assertEqual(os.listdir(workdir), [])
            grab.go(self.server.get_url())
            grab.go(self.server.get_url())
            produced = sorted(os.listdir(workdir))
            self.assertEqual(produced,
                             ['01.html', '01.log', '02.html', '02.log'])
            expectations = (('01.html', 'omsk1'), ('02.html', 'omsk2'))
            for file_name, body in expectations:
                with open(os.path.join(workdir, file_name)) as handle:
                    self.assertEqual(handle.read(), body)
Example #17
0
    def test_log_dir_option(self):
        """Check the files that `log_dir` produces for two requests."""
        with temp_dir() as scratch:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=scratch)
            # First request gets 'omsk1', every later one gets 'omsk2'
            self.server.response_once['get.data'] = 'omsk1'
            self.server.response['get.data'] = 'omsk2'

            self.assertEqual(os.listdir(scratch), [])
            grab.go(self.server.get_url())
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(scratch)),
                             ['01.html', '01.log', '02.html', '02.log'])

            first_page = os.path.join(scratch, '01.html')
            with open(first_page) as stream:
                self.assertEqual(stream.read(), 'omsk1')
            second_page = os.path.join(scratch, '02.html')
            with open(second_page) as stream:
                self.assertEqual(stream.read(), 'omsk2')
Example #18
0
    def test_log_dir_request_content_headers_and_post(self):
        """With ``debug=True`` the log must hold request headers and POST.

        The custom header name and the encoded POST pair are searched
        case-insensitively in ``01.log``.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir, debug=True)
            grab.setup(headers={'X-Name': 'spider'}, post={'xxx': 'Post'})

            self.assertEqual(os.listdir(tmp_dir), [])
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertIn prints the log content on failure, which replaces
            # the ad-hoc debug printing that used to be commented out here.
            self.assertIn('x-name', log_file_content.lower())
            self.assertIn('xxx=post', log_file_content.lower())
Example #19
0
    def test_log_dir_request_content_headers_and_post(self):
        """Check that request headers and POST data are logged in debug mode.

        ``log_dir`` and ``debug=True`` are set together with a custom
        header and a POST dict; both must be found (lower-cased) in
        ``01.log``.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir, debug=True)
            grab.setup(headers={'X-Name': 'spider'}, post={'xxx': 'Post'})

            self.assertEqual(os.listdir(tmp_dir), [])
            grab.go(self.server.get_url())
            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # On failure assertIn reports the searched log text, so no
            # separate debug print of the file content is needed.
            self.assertIn('x-name', log_file_content.lower())
            self.assertIn('xxx=post', log_file_content.lower())
Example #20
0
    def test_log_dir_response_network_error(self):
        """Log files must be written even when the request times out.

        The server's ``sleep`` value (2) is larger than the client
        ``timeout`` (1); request headers are still expected in ``01.log``.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir, timeout=1, user_agent='Perl',
                       debug=True)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]
            self.server.response['sleep'] = 2

            self.assertEqual(os.listdir(tmp_dir), [])
            # A timeout is tolerated here on purpose: the subject of the
            # test is the content of the log, not the exception.
            try:
                grab.go(self.server.get_url())
            except GrabTimeoutError:
                pass

            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertIn prints the log content on failure, unlike
            # assertTrue(... in ...).
            self.assertIn('user-agent: perl', log_file_content.lower())
Example #21
0
 def test_grab_parse_defensedxml(self):
     """Check that an external-entity XML document cannot be queried.

     The XML file references a local file through a SYSTEM entity; after
     loading it via ``file://`` a query on the document is expected to
     raise ``EntitiesForbidden``.
     """
     with temp_dir() as scratch:
         target_path = os.path.join(scratch, 'injection')
         with open(target_path, 'w') as writer:
             writer.write('Hey there!')
         # Build a file:// URL that is valid on both linux and windows
         relative_part = target_path.lstrip('/').replace('\\', '/')
         injection_url = 'file:///%s' % relative_part
         entity_decl = '<!ENTITY ee SYSTEM "' + injection_url + '">'
         document_bytes = (
             '<!DOCTYPE external ['
             + entity_decl
             + ']>'
             + '<root>&ee;</root>'
         ).encode()
         # On windows, forward slashes replace backslashes to avoid:
         # Invalid file://hostname/, expected localhost or 127.0.0.1 or none
         # (replace() does nothing when no backslash is present).
         xml_path = os.path.join(scratch, 'bad.xml').replace('\\', '/')
         with open(xml_path, 'wb') as writer:
             writer.write(document_bytes)
         grab = build_grab(content_type='xml')
         grab.go('file://%s' % xml_path)
         loaded_doc = grab.doc
         with self.assertRaises(EntitiesForbidden):
             loaded_doc('//title')
Example #22
0
    def test_log_dir_response_content_thread(self):
        """Check ``log_dir`` output for a request made in another thread.

        Exactly two produced file names must contain ``'01-thread'``, and
        the ``.log`` file must mention the ``X-Engine`` response header.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]

            self.assertEqual(os.listdir(tmp_dir), [])

            def func():
                grab.go(self.server.get_url())
            thread = threading.Thread(target=func)
            thread.start()
            thread.join()

            files = os.listdir(tmp_dir)
            self.assertEqual(2, len([x for x in files if '01-thread' in x]))
            fname = [x for x in files if x.endswith('.log')][0]
            with open(os.path.join(tmp_dir, fname)) as inp:
                log_file_content = inp.read()
            # assertIn prints the log content on failure, unlike
            # assertTrue(... in ...).
            self.assertIn('x-engine', log_file_content.lower())
Example #23
0
    def test_log_dir_response_content_thread(self):
        """Verify log files written for a request issued from a thread.

        The request runs in a separate ``threading.Thread``; afterwards
        two files whose names contain ``'01-thread'`` must exist and the
        ``.log`` one must include the ``X-Engine`` header name.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]

            self.assertEqual(os.listdir(tmp_dir), [])

            def func():
                grab.go(self.server.get_url())

            thread = threading.Thread(target=func)
            thread.start()
            thread.join()

            files = os.listdir(tmp_dir)
            self.assertEqual(2, len([x for x in files if '01-thread' in x]))
            fname = [x for x in files if x.endswith('.log')][0]
            with open(os.path.join(tmp_dir, fname)) as inp:
                log_file_content = inp.read()
            # assertIn shows the searched log text when the check fails.
            self.assertIn('x-engine', log_file_content.lower())
Example #24
0
    def test_log_dir_response_network_error(self):
        """Check that ``log_dir`` files exist after a timed-out request.

        The client ``timeout`` is 1 while the server ``sleep`` is 2; the
        ``User-Agent`` request header must still be present in ``01.log``.
        """
        with temp_dir() as tmp_dir:
            reset_request_counter()

            grab = build_grab()
            grab.setup(log_dir=tmp_dir,
                       timeout=1,
                       user_agent='Perl',
                       debug=True)
            self.server.response['get.data'] = 'omsk'
            self.server.response['headers'] = [('X-Engine', 'PHP')]
            self.server.response['sleep'] = 2

            self.assertEqual(os.listdir(tmp_dir), [])
            # The timeout error is swallowed deliberately: only the log
            # written for the failed request is being checked.
            try:
                grab.go(self.server.get_url())
            except GrabTimeoutError:
                pass

            self.assertEqual(sorted(os.listdir(tmp_dir)),
                             ['01.html', '01.log'])
            with open(os.path.join(tmp_dir, '01.log')) as inp:
                log_file_content = inp.read()
            # assertIn shows the log text on failure instead of a bare
            # "False is not true".
            self.assertIn('user-agent: perl', log_file_content.lower())