def test_useragent(self):
    grab = build_grab()

    # Null value activates the default random user-agent
    # (for some transports it just allows them to send their own default
    # user-agent, as in the Kit transport case)
    grab = build_grab()
    grab.setup(user_agent=None)
    grab.go(self.server.get_url())
    self.assertTrue(len(self.server.request['headers']) > 0)
    self.assertFalse(
        'PycURL' in self.server.request['headers']['user-agent'])

    # By default user_agent is None => a random user agent is generated
    grab = build_grab()
    grab.go(self.server.get_url())
    self.assertTrue(len(self.server.request['headers']) > 0)
    self.assertFalse(
        'PycURL' in self.server.request['headers']['user-agent'])

    # Simple case: set up the user agent manually
    grab.setup(user_agent='foo')
    grab.go(self.server.get_url())
    self.assertEqual(self.server.request['headers']['user-agent'], 'foo')

    with temp_file() as ua_file:
        # The user agent should be loaded from the file
        with open(ua_file, 'w') as out:
            out.write('GOD')
        grab.setup(user_agent=None, user_agent_file=ua_file)
        grab.go(self.server.get_url())
        self.assertEqual(self.server.request['headers']['user-agent'], 'GOD')

    with temp_file() as ua_file:
        # A random user agent should be loaded from the file
        with open(ua_file, 'w') as out:
            out.write('GOD1\nGOD2')
        grab.setup(user_agent=None, user_agent_file=ua_file)
        grab.go(self.server.get_url())
        self.assertTrue(
            self.server.request['headers']['user-agent'] in ('GOD1', 'GOD2'))
        agent = grab.config['user_agent']

        # The user-agent should not change between requests
        grab.go(self.server.get_url())
        self.assertEqual(self.server.request['headers']['user-agent'], agent)

        # The user-agent should not change between requests
        grab.go(self.server.get_url())
        self.assertEqual(self.server.request['headers']['user-agent'], agent)

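# A minimal sketch of the temp_file() helper that every test in this section
# relies on. This is an assumption for illustration only; the real helper in
# the test suite may be implemented differently. It is assumed to be a context
# manager that yields a path to a fresh temporary file and deletes it on exit.
import os
import tempfile
from contextlib import contextmanager


@contextmanager
def temp_file():
    handle, path = tempfile.mkstemp()
    os.close(handle)
    try:
        yield path
    finally:
        if os.path.exists(path):
            os.unlink(path)
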
def test_load_dump(self):
    with temp_file() as tmp_file:
        g = build_grab()
        cookies = {'foo': 'bar', 'spam': 'ham'}
        g.setup(cookies=cookies)
        g.go(self.server.get_url())
        g.dump_cookies(tmp_file)
        self.assertEqual(set(cookies.items()),
                         set((x['name'], x['value'])
                             for x in json.load(open(tmp_file))))

        g = build_grab()
        cookies = {'foo': 'bar', 'spam': u'begemot'}
        g.setup(cookies=cookies)
        g.go(self.server.get_url())
        g.dump_cookies(tmp_file)
        self.assertEqual(set(cookies.items()),
                         set((x['name'], x['value'])
                             for x in json.load(open(tmp_file))))

        # Test load cookies
        g = build_grab()
        cookies = [{'name': 'foo', 'value': 'bar',
                    'domain': self.server.address},
                   {'name': 'spam', 'value': u'begemot',
                    'domain': self.server.address}]
        json.dump(cookies, open(tmp_file, 'w'))
        g.load_cookies(tmp_file)
        self.assertEqual(set(g.cookies.items()),
                         set((x['name'], x['value']) for x in cookies))

def test_change_proxy(self):
    with temp_file() as tmp_file:
        grab = build_grab()
        grab.change_proxy()
        self.assertEqual(grab.config['proxy'], None)

        grab = build_grab()
        with open(tmp_file, 'w') as out:
            for num in six.moves.range(10):
                out.write('server-%d:777\n' % num)

        grab.load_proxylist(tmp_file, 'text_file',
                            auto_init=False, auto_change=False)
        self.assertEqual(grab.config['proxy'], None)

        grab.load_proxylist(tmp_file, 'text_file',
                            auto_init=False, auto_change=True)
        self.assertEqual(grab.config['proxy'], None)

        grab.load_proxylist(tmp_file, 'text_file',
                            auto_init=True, auto_change=False)
        # pylint: disable=unsupported-membership-test
        self.assertTrue('server-' in grab.config['proxy'])

def test_load_proxylist(self):
    with temp_file() as tmp_file:
        content = "%s\n%s\n%s" % (PROXY1, PROXY2, PROXY3)
        open(tmp_file, "w").write(content)

        # By default auto_change is True
        g = build_grab()
        g.load_proxylist(tmp_file, "text_file")
        self.assertEqual(g.config["proxy_auto_change"], True)
        servers = set()
        for _ in six.moves.range(10):
            g.go("http://yandex.ru")
            servers.add(g.config["proxy"])
        self.assertTrue(len(servers) > 1)

        # Disable auto_change
        # By default auto_init is True
        g = build_grab()
        g.load_proxylist(tmp_file, "text_file", auto_change=False)
        self.assertEqual(g.config["proxy_auto_change"], False)
        servers = set()
        for _ in six.moves.range(10):
            g.go("http://yandex.ru")
            servers.add(g.config["proxy"])
        self.assertEqual(len(servers), 1)

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        g = build_grab()
        g.load_proxylist(tmp_file, "text_file",
                         auto_change=False, auto_init=False)
        self.assertEqual(g.config["proxy_auto_change"], False)
        g.go(self.server.get_url())
        self.assertEqual(g.config["proxy"], None)

def test_cookiefile(self):
    with temp_file() as tmp_file:
        grab = build_grab()
        cookies = [{
            'name': 'spam',
            'value': 'ham',
            'domain': self.server.address
        }]
        json.dump(cookies, open(tmp_file, 'w'))

        # One cookie is sent in the server response,
        # another cookie is passed via the `cookiefile` option
        self.server.response['cookies'] = {'godzilla': 'monkey'}.items()
        grab.setup(cookiefile=tmp_file, debug=True)
        grab.go(self.server.get_url())
        self.assertEqual(self.server.request['cookies']['spam'].value, 'ham')

        # This is the correct result of merging the two cookie sets
        merged_cookies = [('godzilla', 'monkey'), ('spam', 'ham')]

        # grab.cookies should contain the merged cookies
        self.assertEqual(set(merged_cookies), set(grab.cookies.items()))

        # The `cookiefile` file should contain the merged cookies
        self.assertEqual(
            set(merged_cookies),
            set((x['name'], x['value'])
                for x in json.load(open(tmp_file))))

        # Just ensure it works
        grab.go(self.server.get_url())

def test_cookiefile_empty(self):
    with temp_file() as tmp_file:
        grab = build_grab()
        # An empty file should not raise an exception
        open(tmp_file, 'w').write('')
        grab.setup(cookiefile=tmp_file)
        grab.go(self.server.get_url())

def test_load_proxylist_text_file(self):
    with temp_file() as proxy_file:
        open(proxy_file, 'w').write('1.1.1.1:8080')
        grab = build_grab()
        grab.load_proxylist(proxy_file, 'text_file',
                            auto_init=True, auto_change=False)
        self.assertEqual(grab.config['proxy'], '1.1.1.1:8080')

def test_get_next_proxy(self):
    with temp_file() as path:
        plist = ProxyList()
        self.generate_plist_file(path, 'foo:1\nbar:1')
        plist.load_file(path)
        self.assertEqual(plist.get_next_proxy().host, 'foo')
        self.assertEqual(plist.get_next_proxy().host, 'bar')
        self.assertEqual(plist.get_next_proxy().host, 'foo')
        plist.load_file(path)
        self.assertEqual(plist.get_next_proxy().host, 'foo')

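# A hedged sketch of the generate_plist_file() helper used by
# test_get_next_proxy above and test_file_proxy_source below. This is an
# assumption, not the actual implementation: it simply writes the given
# "host:port" lines to `path`, defaulting to two dummy entries (which matches
# the size() == 2 expectation in test_file_proxy_source).
def generate_plist_file(self, path, content=None):
    if content is None:
        content = 'proxy1.example.com:8080\nproxy2.example.com:8080'
    with open(path, 'w') as out:
        out.write(content)
    return path
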
def test_deprecated_setup_proxylist(self):
    with temp_file() as tmp_file:
        g = build_grab()
        open(tmp_file, 'w').write(PROXY1)
        g.load_proxylist(tmp_file, 'text_file')
        self.server.response['get.data'] = '123'
        g.change_proxy()
        g.go('http://yandex.ru')
        self.assertEqual(b'123', g.response.body)
        self.assertEqual('yandex.ru', self.server.request['headers']['host'])

def test_deprecated_setup_proxylist(self):
    with temp_file() as tmp_file:
        proxy = '%s:%s' % (ADDRESS, self.server.port)
        grab = build_grab()
        open(tmp_file, 'w').write(proxy)
        grab.load_proxylist(tmp_file, 'text_file')
        self.server.response['get.data'] = '123'
        grab.change_proxy()
        grab.go('http://yandex.ru')
        self.assertEqual(b'123', grab.doc.body)
        self.assertEqual('yandex.ru', self.server.request['headers']['host'])

def test_setup_proxylist(self):
    with temp_file() as proxy_file:
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        open(proxy_file, 'w').write(content)
        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        bot.add_task(
            Task('baz', grab=Grab(url='http://yandex.ru', debug=True)))
        bot.run()
        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

def test_setup_proxylist2(self):
    with temp_file() as proxy_file:
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        open(proxy_file, 'w').write(content)
        # By default auto_change is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        for _ in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()
        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

def test_setup_proxylist(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in self.extra_servers.values())
        open(proxy_file, 'w').write(content)
        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                           debug=True)))
        bot.run()
        serv = [x['server'] for x in self.extra_servers.values()
                if x['server'].request['done']][0]
        self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

def test_cookie_merging_replace_with_cookies_option(self):
    with temp_file() as tmp_file:
        init_cookies = [{'name': 'foo', 'value': 'bar',
                         'domain': self.server.address}]
        json.dump(init_cookies, open(tmp_file, 'w'))

        g = build_grab(debug=True)
        g.cookies.load_from_file(tmp_file)

        cookies = {
            'foo': 'bar2',
            'sex': 'male',
        }
        g.setup(cookies=cookies)
        g.go(self.server.get_url())
        self.assertEqual(2, len(self.server.request['cookies'].items()))

def test_setup_proxylist2(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in self.extra_servers.values())
        open(proxy_file, 'w').write(content)
        # By default auto_change is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        for _ in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()
        servers = [x['server'] for x in self.extra_servers.values()
                   if x['server'].request['done']]
        for serv in servers:
            self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

def test_load_proxylist(self):
    with temp_file() as tmp_file:
        content = '\n'.join(x['proxy'] for x in self.extra_servers.values())
        with open(tmp_file, 'w') as out:
            out.write(content)

        # By default auto_change is True
        grab = build_grab()
        grab.proxylist.load_file(tmp_file)
        self.assertEqual(grab.config['proxy_auto_change'], True)
        servers = set()
        for _ in six.moves.range(10):
            grab.go('http://yandex.ru')
            servers.add(grab.config['proxy'])
        self.assertTrue(len(servers) > 1)

        # Disable auto_change
        # Change proxy manually
        grab = build_grab()
        grab.proxylist.load_file(tmp_file)
        grab.setup(proxy_auto_change=False)
        grab.change_proxy()
        self.assertEqual(grab.config['proxy_auto_change'], False)
        # TODO: probably call proxy change manually
        servers = set()
        for _ in six.moves.range(10):
            grab.go('http://yandex.ru')
            servers.add(grab.config['proxy'])
        self.assertEqual(len(servers), 1)

        # Disable auto_change
        # By default auto_init is True
        # Proxylist will not be used by default
        grab = build_grab()
        grab.proxylist.load_file(tmp_file)
        grab.setup(proxy_auto_change=False)
        self.assertEqual(grab.config['proxy_auto_change'], False)
        grab.go(self.server.get_url())
        self.assertEqual(grab.config['proxy'], None)

def test_setup_proxylist5(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in self.extra_servers.values())
        open(proxy_file, 'w').write(content)
        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file',
                           auto_change=False, auto_init=False)
        bot.setup_queue()
        for _ in six.moves.range(10):
            bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(self.server.request['headers'].get('host'),
                         '%s:%s' % (ADDRESS, self.server.port))
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
        self.assertEqual(bot.stat.collections['ports'][0], self.server.port)

def test_upload_file(self):
    with temp_file() as file_path:
        g = self.prepare_form_grab()
        data = b'foo'
        with open(file_path, 'wb') as out:
            out.write(data)
        upload_data = UploadFile(file_path)
        g.doc.set_input('image', upload_data)
        g.doc.submit(make_request=False)
        post = dict(g.config['multipart_post'])
        self.assertTrue(isinstance(post['image'], UploadFile))

        g.doc.submit()
        self.assertEqual(data,
                         self.server.request['files']['image'][0]['body'])
        _, filename = os.path.split(file_path)
        self.assertEqual(
            filename,
            self.server.request['files']['image'][0]['filename'])
        self.assertEqual(
            'application/octet-stream',
            self.server.request['files']['image'][0]['content_type'])

def test_upload_file_custom_filename(self):
    with temp_file() as file_path:
        g = self.prepare_form_grab()
        data = b'foo'
        with open(file_path, 'wb') as out:
            out.write(data)
        upload_data = UploadFile(file_path, filename='avatar.jpg')
        g.doc.set_input('image', upload_data)
        g.doc.submit(make_request=False)
        post = dict(g.config['multipart_post'])
        self.assertTrue(isinstance(post['image'], UploadFile))

        g.doc.submit()
        self.assertEqual(data,
                         self.server.request['files']['image'][0]['body'])
        self.assertEqual(
            'avatar.jpg',
            self.server.request['files']['image'][0]['filename'])
        self.assertEqual(
            'image/jpeg',
            self.server.request['files']['image'][0]['content_type'])

def test_load_proxylist(self):
    with temp_file() as tmp_file:
        content = '\n'.join(x['proxy'] for x in self.extra_servers.values())
        open(tmp_file, 'w').write(content)

        # By default auto_change is True
        grab = build_grab()
        grab.load_proxylist(tmp_file, 'text_file')
        self.assertEqual(grab.config['proxy_auto_change'], True)
        servers = set()
        for _ in six.moves.range(10):
            grab.go('http://yandex.ru')
            servers.add(grab.config['proxy'])
        self.assertTrue(len(servers) > 1)

        # Disable auto_change
        # By default auto_init is True
        grab = build_grab()
        grab.load_proxylist(tmp_file, 'text_file', auto_change=False)
        self.assertEqual(grab.config['proxy_auto_change'], False)
        servers = set()
        for _ in six.moves.range(10):
            grab.go('http://yandex.ru')
            servers.add(grab.config['proxy'])
        self.assertEqual(len(servers), 1)

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        grab = build_grab()
        grab.load_proxylist(tmp_file, 'text_file',
                            auto_change=False, auto_init=False)
        self.assertEqual(grab.config['proxy_auto_change'], False)
        grab.go(self.server.get_url())
        self.assertEqual(grab.config['proxy'], None)

def test_load_dump(self):
    with temp_file() as tmp_file:
        grab = build_grab()
        cookies = {'foo': 'bar', 'spam': 'ham'}
        grab.setup(cookies=cookies)
        grab.go(self.server.get_url())
        grab.cookies.save_to_file(tmp_file)
        with open(tmp_file) as inp:
            self.assertEqual(
                set(cookies.items()),
                set((x['name'], x['value']) for x in json.load(inp)))

        grab = build_grab()
        cookies = {'foo': 'bar', 'spam': u'begemot'}
        grab.setup(cookies=cookies)
        grab.go(self.server.get_url())
        grab.cookies.save_to_file(tmp_file)
        with open(tmp_file) as inp:
            self.assertEqual(
                set(cookies.items()),
                set((x['name'], x['value']) for x in json.load(inp)))

        # Test load cookies
        grab = build_grab()
        cookies = [{
            'name': 'foo',
            'value': 'bar',
            'domain': self.server.address
        }, {
            'name': 'spam',
            'value': u'begemot',
            'domain': self.server.address
        }]
        with open(tmp_file, 'w') as out:
            json.dump(cookies, out)
        grab.cookies.load_from_file(tmp_file)
        self.assertEqual(set(grab.cookies.items()),
                         set((x['name'], x['value']) for x in cookies))

def test_file_proxy_source(self):
    with temp_file() as path:
        plist = ProxyList()
        self.generate_plist_file(path)
        plist.load_file(path)
        self.assertEqual(2, plist.size())

def test_sigint(self):
    '''
    Set up the test server to sleep 0.01 sec on each request.
    Start the spider in a separate python shell
    (until SIGINT or max 200 requests).
    Wait 1 sec (~100 requests, in reality fewer because of
    process start-up time).
    Send SIGINT to the process.
    Check that it returned with exit code 13, 130 or 139.
    Code 139 means segfault (yeah...o_O), but as I see from the logs
    it segfaults after successfully processing the SIGINT,
    and that is all I need from this test.
    '''
    #logging.error('step-0')
    # pylint: disable=no-member
    self.server.response['sleep'] = 0.01
    # pylint: enable=no-member
    with temp_file() as path:
        with open(path, 'w') as out:
            # pylint: disable=no-member
            out.write(self.script_tpl % ('', self.server.get_url()))
            # pylint: enable=no-member
        ret_codes = []
        for _ in range(10):
            #logging.error('step-1')
            proc = Popen('python %s' % path, shell=True)
            #logging.error('step-2')
            parent = Process(proc.pid)
            #logging.error('step-3')
            time.sleep(1)
            #logging.error('killing children')
            for child in parent.children():
                #logging.error('CHILD: %s', child.pid)
                # Send multiple SIGINTs because in very rare cases
                # a single SIGINT is ignored :-/
                # Do not send them too fast
                for _ in range(1):
                    try:
                        #logging.error('sending sigint')
                        child.send_signal(SIGNAL_INT)
                    except NoSuchProcess:
                        break
                    else:
                        time.sleep(1)
            if platform.system() == 'Darwin':
                # On OSX the Popen(shell=True) spawns only
                # one process, no child
                #logging.error('Killing parent')
                #logging.error('PARENT: %s', parent.pid)
                # Send multiple SIGINTs because in very rare cases
                # a single SIGINT is ignored :-/
                # Do not send them too fast
                for _ in range(1):
                    try:
                        #logging.error('sending sigint')
                        parent.send_signal(SIGNAL_INT)
                    except NoSuchProcess:
                        break
                    else:
                        time.sleep(1)
            #logging.error('step-4')
            ret = None
            for _ in range(20):
                #print('before proc-poll-%d' % step)
                ret = proc.poll()
                if ret is not None:
                    break
                time.sleep(0.1)
            else:
                #logging.error('CHILD PROCESS DID NOT RETURN')
                #raise Exception('Child process did not return')
                # Try to clean up the processes
                try:
                    for child in parent.children():
                        child.send_signal(signal.SIGTERM)
                except NoSuchProcess:
                    pass
                time.sleep(0.5)
                try:
                    parent.send_signal(signal.SIGTERM)
                except NoSuchProcess:
                    pass
            #logging.error('step-5')
            # FIXME: find out the reason of the segfault
            # Exit code 130 means the program was terminated by ctrl-c
            #print('RET CODE: %s' % ret)
            ret_codes.append(ret)

        # Allow at most one failure out of the 10 runs (10%)
        # pylint: disable=no-member
        self.assertTrue(
            sum(1 for x in ret_codes if x in (13, 130, 139)) >= 9)

def test_download(self):
    with temp_file() as save_file:
        grab = build_grab()
        self.server.response['get.data'] = 'FOO'
        length = grab.download(self.server.get_url(), save_file)
        self.assertEqual(3, length)