def test_session(self):
    """Cookies received from the server must be sent back while
    reuse_cookies is enabled, dropped when it is disabled, and
    forgotten after clear_cookies()."""
    # reuse_cookies=True: the cookie comes back on every later request.
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.setup(reuse_cookies=True)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
    grab.go(SERVER.BASE_URL)
    self.assertEqual(grab.response.cookies['foo'], 'bar')
    for _ in (1, 2):
        grab.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')

    # reuse_cookies=False: the cookie is received but never resent.
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.setup(reuse_cookies=False)
    SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
    grab.go(SERVER.BASE_URL)
    self.assertEqual(grab.response.cookies['foo'], 'baz')
    grab.go(SERVER.BASE_URL)
    self.assertFalse('Cookie' in SERVER.REQUEST['headers'])

    # clear_cookies() wipes the stored session cookies.
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.setup(reuse_cookies=True)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
    grab.go(SERVER.BASE_URL)
    self.assertEqual(grab.response.cookies['foo'], 'bar')
    grab.clear_cookies()
    grab.go(SERVER.BASE_URL)
    self.assertFalse('Cookie' in SERVER.REQUEST['headers'])
def test_session(self):
    """Session-cookie handling: cookies from the server are resent
    while reuse_cookies is on, and are absent when it is off or after
    clear_cookies()."""
    # Grab receives a cookie from the server and sends it back
    # on each following request.
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.setup(reuse_cookies=True)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
    grab.go(SERVER.BASE_URL)
    self.assertEqual(grab.response.cookies['foo'], 'bar')
    for _ in (1, 2):
        grab.go(SERVER.BASE_URL)
        self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')

    # With reuse_cookies=False no cookies reach the server.
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.setup(reuse_cookies=False)
    SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
    grab.go(SERVER.BASE_URL)
    self.assertEqual(grab.response.cookies['foo'], 'baz')
    grab.go(SERVER.BASE_URL)
    self.assertEqual(len(SERVER.REQUEST['cookies']), 0)

    # clear_cookies() discards the session before the next request.
    grab = Grab(transport=GRAB_TRANSPORT)
    grab.setup(reuse_cookies=True)
    SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
    grab.go(SERVER.BASE_URL)
    self.assertEqual(grab.response.cookies['foo'], 'bar')
    grab.clear_cookies()
    grab.go(SERVER.BASE_URL)
    self.assertEqual(len(SERVER.REQUEST['cookies']), 0)
def task_generator(self): logging.debug("*****execute******") with open('directories.csv', 'rb') as f: content = csv.reader(f) directories = list(content) # directories = ['google'] total = len(directories) logging.debug("*****{}******".format(total)) i = 100 total = 102 headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', } test_url = 'https://www.google.com' while (True): logging.debug("Index: {}".format(i)) if i >= total: break g = Grab() g.clear_cookies() g.setup(**config) g.setup(headers=headers) logging.debug("CONFIG : {}".format(g.config)) data = dict(slug=directories[i][0], ) logging.info(data) while True: try: print "------------------------" g.go(test_url) print g.doc.body print "++++++++++++++++++++++++" break except Exception as e: print "************************" logging.debug(e) time.sleep(1) yield Task('init', grab=g, data=data) time.sleep(5) i += 1
def search(query, grab=None, limit=None, per_page=None):
    """Yield search result items ({url, title, index_size} dicts) for *query*.

    Pages through the search engine until a page yields no results,
    the last page is detected, or *limit* total results have been
    yielded.

    :param query: search query string.
    :param grab: optional preconfigured Grab instance (a fresh one is
        created when omitted).
    :param limit: stop after this many results in total (None = no limit).
    :param per_page: results per page, passed through to build_search_url().
    """
    if not grab:
        grab = Grab()
    grab.clear_cookies()
    if grab.proxylist:
        grab.change_proxy()

    stop = False
    total_count = 0  # results yielded across all pages (drives `limit`)
    for page in xrange(1, 9999):
        if stop:
            break
        url = build_search_url(query, page, per_page=per_page)
        # NOTE(review): assumes grab.go() returns an object accepted by
        # parse_search_results() -- preserved from the original code.
        grab = grab.go(url)
        page_count = 0  # results found on this page only
        for item in parse_search_results(grab):
            yield item
            page_count += 1
            total_count += 1
        if not page_count:
            # Empty page: nothing more to fetch.
            stop = True
        if is_last_page(grab):
            logging.debug('Last page found')
            stop = True
        # BUGFIX: the limit used to be compared against the per-page
        # counter (which was reset every page), so any limit larger than
        # one page of results was never reached. Compare the running
        # total instead.
        if limit is not None and total_count >= limit:
            logging.debug('Limit %d reached' % limit)
            stop = True
        grab.sleep(3, 5)
class ParserWithProxy(Spider):
    u"""Base spider class that rotates requests through proxies."""

    USE_PROXY = True

    def __init__(self, country_code, *args, **kwargs):
        """Set up proxy bookkeeping, the task queue and (optionally) cache.

        :param country_code: ISO alpha-2 code used to pick proxies.
        """
        super(ParserWithProxy, self).__init__(*args, **kwargs)
        self.country = countries.get(alpha2=country_code)
        self.proxies = []          # fetched proxies not yet handed out
        self.used_proxies = set()  # value-tuples of proxies already used
        self.grab = None
        self.grab_use_count = None
        self.reinit_grab()
        self.setup_queue(getattr(config, 'QUEUE_BACKEND', 'memory'))
        if getattr(config, 'CACHE_ENABLED', False):
            self.setup_cache('mongo', getattr(config, 'CACHE_DATABASE', 'cache'))

    def check_grab(self, grab):
        """Hook for subclasses: return True when the grab/proxy is usable."""
        return True

    def reinit_grab(self):
        """Reset the shared Grab instance onto a fresh, working proxy."""
        if not self.grab:
            self.grab = Grab()
        self.grab_use_count = 0
        while True:
            self.grab.clear_cookies()
            self.grab.setup(**self.get_next_proxy())
            if self.check_grab(self.grab):
                break
            logger.info(u'Плохая прокси. Смена...')

    def get_grab(self):
        """Return a clone of the shared Grab, rotating the proxy after
        it has been handed out config.PROXY_USE_LIMIT times."""
        self.grab_use_count += 1
        if self.grab_use_count > config.PROXY_USE_LIMIT:
            self.reinit_grab()
        return self.grab.clone()

    def get_next_proxy(self):
        u"""Return the next never-used proxy as Grab setup kwargs.

        Blocks (polling every 10 s) until an unused proxy is available.
        Returns {} when proxy usage is disabled.
        """
        if not self.USE_PROXY:
            return {}
        while not self.proxies:
            # Fetch a batch and drop every proxy already seen.
            fetched = get_proxy_list(self.country.alpha2, 100)
            self.proxies = [
                proxy for proxy in fetched
                if tuple(proxy.values()) not in self.used_proxies
            ]
            if self.proxies:
                break
            logger.info(u'Кончились прокси, ожидание новых')
            sleep(10)
        # Hand out the head of the list and remember it as used.
        proxy = self.proxies.pop(0)
        self.used_proxies.add(tuple(proxy.values()))
        return proxy
def id_for_answer(answer): """возвращает числовой идентификатор варианта ответа""" body = G.response.body ai = body.find(answer) fr = body[ai - 50: ai] id = fr[fr.find('PDI_answer') + 10:-2] return id def vote(c, l, j, g, o, p, k, m, h, f): """голосуем...""" d = id_for_answer(POSITION) url = 'http://polldaddy.com/vote.php?va={}&pt={}&r={}&p={}&a={}&o=&t={}&token={}'.format(o, g, j, c, d, p, f) G.go(url) try: print G.css_text('.poll-msg'), except: print 'No msg', print G.css_list('.votes')[POSITION_POS - 1].text #запускаем голосовалку for i in range(0, VOTES): G.clear_cookies() G.go('http://polldaddy.com/poll/6061575/') vote_call = G.css_list('.button-lrg')[0].attrib['onclick'] vote_call_args = vote_call[vote_call.find('(') + 1:vote_call.find(')')] vote_call_args = vote_call_args.strip().replace("'", '').split(',') vote(*vote_call_args) sleep(2)