def test_grab_parse_defensedxml(self):
    """Check that XML declaring an external entity is refused.

    An XML file whose DTD declares the external entity ``ee`` (pointing
    at a local "injection" file) is written to a temporary directory and
    loaded through a ``file://`` URL. Querying the resulting document is
    expected to raise ``EntitiesForbidden``.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        injection_url = 'file:///%s' % (
            injection_path.lstrip('/').replace('\\', '/')
        )
        # The document body must reference the declared entity with the
        # literal text "&ee;" so that the payload really exercises
        # external entity expansion.
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            '<root>&ee;</root>'
        ).encode()
        # On windows, use slashes instead of backslashes to avoid error:
        # Invalid file://hostname/, expected localhost or 127.0.0.1 or none
        xml_file = os.path.join(tmp_dir, 'bad.xml').replace('\\', '/')
        with open(xml_file, 'wb') as out:
            out.write(bad_xml)
        grab = build_grab(content_type='xml')
        file_url = 'file://%s' % xml_file
        grab.go(file_url)
        self.assertRaises(EntitiesForbidden, grab.doc, '//title')
def test_transport_option_as_string_fake(self):
    """Check a transport referenced by the dotted string 'foo.FakeTransport'.

    ``FAKE_TRANSPORT_CODE`` is written to ``foo.py`` inside a temporary
    directory that is put at the front of ``sys.path``; the check itself
    is delegated to ``self.assert_transport_response`` with the expected
    body ``b'XYZ'``.
    """
    with temp_dir() as dir_:
        sys.path.insert(0, dir_)
        try:
            with open(os.path.join(dir_, 'foo.py'), 'w') as out:
                out.write(FAKE_TRANSPORT_CODE)
            self.assert_transport_response('foo.FakeTransport', b'XYZ')
        finally:
            # Undo the sys.path change even when the assertion fails, so
            # a soon-to-be-deleted directory is not left on the import path.
            sys.path.remove(dir_)
def test_body_inmemory_false(self):
    """Exercise the ``body_inmemory=False`` option.

    Three situations are covered: no storage directory configured
    (``GrabMisuseError`` expected), a storage directory only, and a
    storage directory combined with an explicit file name.
    """
    with temp_dir() as storage_dir:
        # Without body_storage_dir the request must be rejected.
        grab = build_grab()
        grab.setup(body_inmemory=False)
        with self.assertRaises(GrabMisuseError):
            grab.go(self.server.get_url())

        # With a storage directory the body ends up in a file there.
        self.server.response['get.data'] = b'foo'
        grab = build_grab()
        grab.setup(body_inmemory=False)
        grab.setup(body_storage_dir=storage_dir)
        grab.go(self.server.get_url())
        self.assertTrue(os.path.exists(grab.doc.body_path))
        self.assertIn(storage_dir, grab.doc.body_path)
        with open(grab.doc.body_path, 'rb') as body_file:
            stored = body_file.read()
        self.assertEqual(b'foo', stored)
        # pylint: disable=protected-access
        self.assertEqual(grab.doc._bytes_body, None)
        # pylint: enable=protected-access

        # A second request must be stored under a different path.
        first_path = grab.doc.body_path
        grab.go(self.server.get_url())
        self.assertNotEqual(first_path, grab.doc.body_path)

    with temp_dir() as storage_dir:
        # An explicit file name fixes the location of the stored body.
        self.server.response['get.data'] = 'foo'
        grab = build_grab()
        grab.setup(body_inmemory=False)
        grab.setup(body_storage_dir=storage_dir)
        grab.setup(body_storage_filename='music.mp3')
        grab.go(self.server.get_url())
        self.assertTrue(os.path.exists(grab.doc.body_path))
        self.assertIn(storage_dir, grab.doc.body_path)
        with open(grab.doc.body_path, 'rb') as body_file:
            stored = body_file.read()
        self.assertEqual(b'foo', stored)
        expected_path = os.path.join(storage_dir, 'music.mp3')
        self.assertEqual(expected_path, grab.doc.body_path)
        self.assertEqual(grab.doc.body, b'foo')
        # pylint: disable=protected-access
        self.assertEqual(grab.doc._bytes_body, None)
def test_save_hash(self):
    """Test `Response.save_hash` method.

    The bytes of ``IMG_FILE`` are served, fetched and passed through
    ``save_hash``; the file at the returned path (relative to the
    temporary directory) must contain the same bytes.
    """
    with temp_dir() as target_dir:
        with open(IMG_FILE, 'rb') as source:
            expected = source.read()
        self.server.response['get.data'] = expected

        grab = build_grab()
        grab.go(self.server.get_url())
        rel_path = grab.doc.save_hash(self.server.get_url(), target_dir)

        saved_path = os.path.join(target_dir, rel_path)
        with open(saved_path, 'rb') as saved:
            actual = saved.read()
        self.assertEqual(actual, expected)
def test_save(self):
    """Test `Response.save` method.

    The bytes of ``IMG_FILE`` are served, fetched and saved to
    ``file.bin`` in a temporary directory; the saved file must contain
    the same bytes.
    """
    with temp_dir() as target_dir:
        with open(IMG_FILE, 'rb') as source:
            expected = source.read()
        destination = os.path.join(target_dir, 'file.bin')
        self.server.response['get.data'] = expected

        grab = build_grab()
        grab.go(self.server.get_url())
        grab.doc.save(destination)

        with open(destination, 'rb') as saved:
            actual = saved.read()
        self.assertEqual(actual, expected)
def test_log_option(self):
    """Check that ``log_file`` receives the body of the response.

    The temporary directory starts empty; after one request it must
    contain only ``lograb.html`` holding the served text.
    """
    with temp_dir() as work_dir:
        reset_request_counter()
        target = os.path.join(work_dir, 'lograb.html')
        grab = build_grab()
        grab.setup(log_file=target)
        self.server.response['get.data'] = 'omsk'

        self.assertEqual(os.listdir(work_dir), [])
        grab.go(self.server.get_url())
        self.assertEqual(os.listdir(work_dir), ['lograb.html'])

        with open(target) as logged:
            content = logged.read()
        self.assertEqual(content, 'omsk')
def test_lxml_security_bug(self):
    """Show that ``parse`` expands an external entity from a local file.

    The DTD declares the external entity ``ee`` pointing at a temporary
    "injection" file; the test asserts that the text of ``<root>`` equals
    that file's content after parsing.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        injection_url = 'file:///%s' % (
            injection_path.lstrip('/').replace('\\', '/')
        )
        # The body must contain the entity reference "&ee;"; any other
        # text would be returned verbatim and could never equal the
        # content of the injection file.
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            '<root>&ee;</root>'
        ).encode()
        tree = parse(BytesIO(bad_xml))
        self.assertEqual(tree.xpath('//root/text()')[0], 'Hey there!')
def test_log_dir_response_content(self):
    """Check that the ``.log`` file in ``log_dir`` mentions a response header.

    The server is configured to send an ``X-Engine`` header; after one
    request ``01.html`` and ``01.log`` must exist and the log text must
    contain the header name (compared case-insensitively).
    """
    with temp_dir() as log_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(log_dir=log_dir)
        self.server.response['get.data'] = 'omsk'
        self.server.response['headers'] = [('X-Engine', 'PHP')]

        self.assertEqual(os.listdir(log_dir), [])
        grab.go(self.server.get_url())
        created = sorted(os.listdir(log_dir))
        self.assertEqual(created, ['01.html', '01.log'])

        with open(os.path.join(log_dir, '01.log')) as log_file:
            log_text = log_file.read()
        self.assertIn('x-engine', log_text.lower())
def test_log_dir_request_content_is_empty(self):
    """Check that request details are absent from the ``.log`` file.

    The ``debug`` option is not passed here; the custom request header
    name and the POST payload must not appear in ``01.log``.
    """
    with temp_dir() as log_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(log_dir=log_dir)
        grab.setup(headers={'X-Name': 'spider'}, post='xxxPost')

        self.assertEqual(os.listdir(log_dir), [])
        grab.go(self.server.get_url())
        created = sorted(os.listdir(log_dir))
        self.assertEqual(created, ['01.html', '01.log'])

        with open(os.path.join(log_dir, '01.log')) as log_file:
            log_text = log_file.read()
        self.assertNotIn('X-Name', log_text)
        self.assertNotIn('xxxPost', log_text)
def test_lxml_security_bug(self):
    """Show that ``parse`` expands an external entity from a local file.

    A temporary "injection" file is referenced by the external entity
    ``ee`` declared in the DTD; after parsing, the text of ``<root>`` is
    asserted to equal the content of that file.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        url_path = injection_path.lstrip('/').replace('\\', '/')
        injection_url = 'file:///%s' % url_path
        # "&ee;" is the reference to the entity declared in the DTD;
        # without it the parser has nothing to expand.
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            '<root>&ee;</root>'
        ).encode()
        tree = parse(BytesIO(bad_xml))
        self.assertEqual(tree.xpath('//root/text()')[0], 'Hey there!')
def test_grab_parse_defensedxml(self):
    """Check that XML declaring an external entity is refused.

    An XML file whose DTD declares the external entity ``ee`` (pointing
    at a local "injection" file) is written to a temporary directory and
    loaded through a ``file://`` URL. Querying the resulting document is
    expected to raise ``EntitiesForbidden``.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        injection_url = 'file:///%s' % (
            injection_path.lstrip('/').replace('\\', '/')
        )
        # The body references the declared entity with the literal
        # text "&ee;".
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            '<root>&ee;</root>'
        ).encode()
        # On windows, use slashes instead of backslashes to avoid error:
        # Invalid file://hostname/, expected localhost or 127.0.0.1 or none
        xml_file = os.path.join(tmp_dir, 'bad.xml').replace('\\', '/')
        with open(xml_file, 'wb') as out:
            out.write(bad_xml)
        grab = build_grab(content_type='xml')
        grab.go('file://%s' % xml_file)
        self.assertRaises(EntitiesForbidden, grab.doc, '//title')
def test_log_dir_option(self):
    """Check the files written to ``log_dir`` for two consecutive requests.

    The first response body is 'omsk1' (set via ``response_once``), the
    second is 'omsk2'; they must land in ``01.html`` and ``02.html``
    next to the matching ``.log`` files.
    """
    with temp_dir() as log_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(log_dir=log_dir)
        self.server.response_once['get.data'] = 'omsk1'
        self.server.response['get.data'] = 'omsk2'

        self.assertEqual(os.listdir(log_dir), [])
        grab.go(self.server.get_url())
        grab.go(self.server.get_url())
        created = sorted(os.listdir(log_dir))
        self.assertEqual(
            created, ['01.html', '01.log', '02.html', '02.log']
        )

        with open(os.path.join(log_dir, '01.html')) as first_page:
            first_body = first_page.read()
        self.assertEqual(first_body, 'omsk1')
        with open(os.path.join(log_dir, '02.html')) as second_page:
            second_body = second_page.read()
        self.assertEqual(second_body, 'omsk2')
def test_log_dir_request_content_headers_and_post(self):
    """Check that request details are logged when ``debug=True``.

    With ``debug=True`` passed to ``setup``, the custom request header
    name and the encoded POST pair must appear in ``01.log`` (compared
    case-insensitively).
    """
    with temp_dir() as log_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(log_dir=log_dir, debug=True)
        grab.setup(headers={'X-Name': 'spider'}, post={'xxx': 'Post'})

        self.assertEqual(os.listdir(log_dir), [])
        grab.go(self.server.get_url())
        created = sorted(os.listdir(log_dir))
        self.assertEqual(created, ['01.html', '01.log'])

        with open(os.path.join(log_dir, '01.log')) as log_file:
            log_text = log_file.read()
        lowered = log_text.lower()
        self.assertIn('x-name', lowered)
        self.assertIn('xxx=post', lowered)
def test_log_dir_response_network_error(self):
    """Check that log files are written even if the request times out.

    The client timeout is 1 while the server is told to sleep for 2
    (presumably seconds - confirm against the test server), so the
    request may raise ``GrabTimeoutError``, which is tolerated.
    ``01.html`` and ``01.log`` must exist afterwards and the log must
    contain the configured user agent.
    """
    with temp_dir() as log_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(
            log_dir=log_dir,
            timeout=1,
            user_agent='Perl',
            debug=True,
        )
        self.server.response['get.data'] = 'omsk'
        self.server.response['headers'] = [('X-Engine', 'PHP')]
        self.server.response['sleep'] = 2

        self.assertEqual(os.listdir(log_dir), [])
        try:
            grab.go(self.server.get_url())
        except GrabTimeoutError:
            # The timeout is the expected outcome; only the log matters.
            pass
        created = sorted(os.listdir(log_dir))
        self.assertEqual(created, ['01.html', '01.log'])

        with open(os.path.join(log_dir, '01.log')) as log_file:
            log_text = log_file.read()
        self.assertIn('user-agent: perl', log_text.lower())
def test_grab_parse_defensedxml(self):
    """Check that XML declaring an external entity is refused.

    The XML file written here declares the external entity ``ee``
    pointing at a local "injection" file. It is loaded through a
    ``file://`` URL, and querying the resulting document is expected to
    raise ``EntitiesForbidden``.
    """
    with temp_dir() as tmp_dir:
        injection_path = os.path.join(tmp_dir, 'injection')
        with open(injection_path, 'w') as out:
            out.write('Hey there!')
        # Prepare file:// URL valid for both linux and windows
        url_path = injection_path.lstrip('/').replace('\\', '/')
        injection_url = 'file:///%s' % url_path
        # "&ee;" references the entity declared in the DTD so that the
        # payload really exercises external entity expansion.
        bad_xml = (
            '<!DOCTYPE external ['
            '<!ENTITY ee SYSTEM "' + injection_url + '">'
            ']>'
            '<root>&ee;</root>'
        ).encode()
        # On windows, use slashes instead of backslashes to avoid error:
        # Invalid file://hostname/, expected localhost or 127.0.0.1 or none
        xml_file = os.path.join(tmp_dir, 'bad.xml').replace('\\', '/')
        with open(xml_file, 'wb') as out:
            out.write(bad_xml)
        grab = build_grab(content_type='xml')
        file_url = 'file://%s' % xml_file
        grab.go(file_url)
        self.assertRaises(EntitiesForbidden, grab.doc, '//title')
def test_log_dir_response_content_thread(self):
    """Check ``log_dir`` output for a request made from another thread.

    The request runs in a separate ``threading.Thread``. Exactly two
    file names containing '01-thread' must be created, and the first
    listed ``.log`` file must mention the ``X-Engine`` response header
    (compared case-insensitively).
    """
    with temp_dir() as log_dir:
        reset_request_counter()
        grab = build_grab()
        grab.setup(log_dir=log_dir)
        self.server.response['get.data'] = 'omsk'
        self.server.response['headers'] = [('X-Engine', 'PHP')]
        self.assertEqual(os.listdir(log_dir), [])

        def fetch_in_worker():
            grab.go(self.server.get_url())

        worker = threading.Thread(target=fetch_in_worker)
        worker.start()
        worker.join()

        names = os.listdir(log_dir)
        thread_files = [name for name in names if '01-thread' in name]
        self.assertEqual(2, len(thread_files))

        log_names = [name for name in names if name.endswith('.log')]
        log_name = log_names[0]
        with open(os.path.join(log_dir, log_name)) as log_file:
            log_text = log_file.read()
        self.assertIn('x-engine', log_text.lower())