def task_generator(self):
    ua = UserAgent()
    grab = Grab()
    grab.load_proxylist(
        PROXY_PATH, 'text_file',
        proxy_type='http',
        auto_init=False,
        auto_change=True
    )
    for link in VOCABULARY:
        url = link['url']
        pages = xrange(1, link['pages'])
        cat = link['cat']
        for page in pages:
            grab.change_proxy()
            grab.setup(
                url=url % page,
                proxy_userpwd=CREDENTIALS,
                hammer_mode=True,
                hammer_timeouts=HTM,
                user_agent=ua.random,
                reuse_cookies=False
            )
            yield Task('link_on_page', grab=grab, cat=cat)
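# A minimal sketch (not from the original source) of the module-level names the
# generator above assumes. All values are hypothetical placeholders; only the
# shapes are inferred from how task_generator uses them.
PROXY_PATH = 'proxies.txt'            # plain-text proxy list, one proxy per line
CREDENTIALS = 'user:password'         # proxy_userpwd takes a "user:password" string
HTM = ((2, 5), (10, 15), (20, 30))    # hammer_timeouts: (connect, total) timeout pairs
VOCABULARY = [
    {'url': 'http://example.com/catalog?page=%d', 'pages': 10, 'cat': 'phones'},
]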
def task_generator(self):
    ua = UserAgent()
    grab = Grab()
    grab.load_proxylist(self.PROXY_PATH, 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    for link in VOCABULARY:
        url = link['url']
        part_url = LOCALITY[self.city_pars]
        pages = xrange(1, link['pages'])
        cat = Category.objects.get(id=link['cat'][1])
        moroz = link['cat'][0]
        city = City.objects.get(id=LOCALITY[self.city_pars][1])
        for page in pages:
            print 'number_of_pages=', page
            grab.change_proxy()
            grab.setup(
                url=url % (part_url[0][0], part_url[0][1], part_url[0][2], page),
                proxy_userpwd=self.CREDENTIALS,
                hammer_mode=True,
                hammer_timeouts=HTM,
                user_agent=ua.random,
                reuse_cookies=False)
            # check_proxies_for_slando(self, grab=grab, ua=ua.random,
            #                          url=url % (LOCALITY[self.city_pars][0], page))
            print 'proxy before go of page list ', grab.config['proxy']
            yield Task('link_on_page', delay=4, grab=grab, cat=cat,
                       city=city, moroz=moroz)
def test_deprecated_setup_proxylist(self):
    g = Grab(transport=GRAB_TRANSPORT)
    open(TMP_FILE, 'w').write(PROXY1)
    g.load_proxylist(TMP_FILE, 'text_file')
    SERVER.RESPONSE['get'] = '123'
    g.change_proxy()
    g.go('http://yandex.ru')
    self.assertEqual('123', g.response.body)
    self.assertEqual('yandex.ru', SERVER.REQUEST['headers']['host'])
def task_generator(self):
    ua = UserAgent()
    grab = Grab(timeout=30)
    grab.load_proxylist('proxy_http_auth.txt', 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    # grab.config["thread_number"] = 40
    for link in VOCABULARY:
        url = link['url']
        pages = xrange(1, link['pages'])
        cat = link['cat']
        for page in pages:
            grab.change_proxy()
            grab.setup(url=url % page,
                       proxy_userpwd=CREDENTIALS,
                       hammer_mode=True,
                       hammer_timeouts=((2, 5), (10, 15), (20, 30)),
                       user_agent=ua.random,
                       reuse_cookies=True)
            yield Task('link_on_page', grab=grab, cat=cat)
def task_generator(self):
    ua = UserAgent()
    grab = Grab(timeout=30)
    grab.load_proxylist('proxy_http_auth.txt', 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    while True:
        dig = random.randint(111, 999)
        grab.change_proxy()
        grab.setup(
            url='http://zipexpert.com.ua/catalog/?q=%s&s=' % dig,
            # url='http://good-service.com.ua/content/zapchasti-dlya-stiralnykh-mashin-v-kharkove-i-po-vsei-ukraine-optom-i-v-roznitsu',
            proxy_userpwd=CREDENTIALS,
            hammer_mode=True,
            hammer_timeouts=((2, 5), (10, 15), (20, 30)),
            user_agent=ua.random,
            reuse_cookies=False)
        yield Task('link_on_page', grab=grab)
def search(query, grab=None, limit=None, per_page=None):
    if not grab:
        grab = Grab()
    stop = False
    count = 0
    grab.clear_cookies()
    if grab.proxylist:
        grab.change_proxy()
    for page in xrange(1, 9999):
        if stop:
            break
        url = build_search_url(query, page, per_page=per_page)
        index_size = None
        # go() returns the response, so keep using the same Grab instance here
        grab.go(url)
        # grab = google_request(url, grab=grab)
        count = 0
        for item in parse_search_results(grab):
            yield item  # {url, title, index_size}
            count += 1
        if not count:
            stop = True
        if is_last_page(grab):
            logging.debug('Last page found')
            stop = True
        if limit is not None and count >= limit:
            logging.debug('Limit %d reached' % limit)
            stop = True
        grab.sleep(3, 5)
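# A minimal usage sketch (not part of the original snippet): drive search()
# with a Grab instance that has a proxy list loaded. The file name and query
# are hypothetical; result keys follow the {url, title, index_size} comment above.
from grab import Grab

g = Grab()
g.load_proxylist('proxies.txt', 'text_file', proxy_type='http',
                 auto_init=False, auto_change=True)
for result in search('grab python library', grab=g, limit=20):
    print result['url'], result['title']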
class Avito():
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value('sleep', Avito.SLEEP).toInt()
        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy, 'text_file', 'http',
                                  auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except:
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith('http:'):
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except:
                    log.exception('get_item left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')

        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [
            s.text()
            for s in gallery.select('.//a[@class="gallery-link"]/@href')
        ]
        if not photos:
            egg = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        #price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()
        try:
            town = item.select('.//span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except:
            log.warning('xpath town not found, try another way')
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        #desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select("//div[contains(@class,\"description-text\")]").text()
        #<span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")
        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}} return r;}''')
            #http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval("'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            #egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300, whitelist='0123456789-')
                break
            except:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug('get phone error')
        return dict(item=item_id, title=title, photos=photos, price=price,
                    name=name, town=town, desc=desc, phone=phone)