def start_requests(self):
    yield Request(
        url="https://www.v2ex.com/?tab=hot",
        callback=self.parse_hot,
        recrawl=5,
        links_to_abs=True,
    )
async def _execute(self):
    if self.meta.get('enable'):
        interval_s = self.meta.get('interval', 60) * 60
        if 'browser' in self.meta:
            for url in self.meta.get('resource', []):
                r = BrowserRequest(url,
                                   page_callback=self.meta['browser'],
                                   recrawl=interval_s)
                yield r
        else:
            cb = None
            if 'css_divider' in self.meta:
                css_divider = self.meta['css_divider']
                if css_divider not in self.cb_table:
                    # cache one Parser per divider so sites sharing a
                    # layout reuse the same parse callback
                    self.cb_table[css_divider] = Parser(
                        css_divider=css_divider,
                        item_type=ProxyParseItem).parse
                cb = self.cb_table[css_divider]
            for url in self.meta.get('resource', []):
                r = Request(url,
                            priority=random.randint(0, 100),
                            callback=self.crawler.parse,
                            recrawl=interval_s,
                            status_allowed=[])
                if cb:
                    r.add_callback(cb)
                yield r
    yield None
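# For context, `_execute` above reads a per-site config dict from
# `self.meta`. A minimal sketch of the shape it expects: the key names
# (`enable`, `interval`, `browser`, `css_divider`, `resource`) come from
# the code above, but the concrete values here are illustrative
# assumptions, not taken from the repo.
site_meta = {
    'enable': True,              # skip the site entirely when False
    'interval': 30,              # recrawl interval in minutes (x60 above)
    'css_divider': 'table tr',   # optional: selector splitting one proxy per row
    'resource': [                # pages to fetch for this site
        'http://example.com/free-proxy-list',
    ],
    # 'browser': some_page_callback,  # optional: switch to BrowserRequest
}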
async def next_requests(self):
    await self.redis.delete(self.keys['tmp'])
    await self.redis.delete(self.pq_key)
    self.create_task(self.transfer_tmp())
    while True:
        proxy = await self.redis.spop(self.keys['tmp'])
        if proxy:
            proxy = proxy.decode()
            old_score = await self.redis.zscore(self.keys['score'], proxy)
            req = Request(
                self.url,
                # accept any response status
                status_allowed=[],
                # don't retry this request on failure
                ignore_exception=True,
                dont_filter=True,
                request_config={
                    'timeout': 40,
                    'proxy': 'http://' + proxy
                },
                meta={
                    'proxy': proxy,
                    'old_score': old_score
                })
            await self.add_task(req)
        else:
            await asyncio.sleep(5)
async def start_requests(self):
    urls = []
    for web in WEBSITES:
        if web["enable"]:
            for url in web["resource"]:
                urls.append(url)
    random.shuffle(urls)
    for url in urls:
        yield Request(url)
async def start_requests(self):
    yield Request(wh_top(1), callback=self.parse_top_search)
async def start_requests(self):
    for tid in range(MIN_TID, MAX_TID + 1):
        url = "http://api.bilibili.com/x/web-interface/newlist?rid={}&pn=1&ps=1".format(tid)
        yield Request(url, callback=self.parse_json, meta={"tid": tid})
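# The matching `parse_json` callback is not shown here. A minimal sketch,
# assuming the response exposes `.json` and `.meta` as in the other
# snippets, and assuming the bilibili newlist payload keeps its total
# count under `data.page.count` (an assumption about the external API,
# not something taken from this repo):
async def parse_json(self, response):
    tid = response.meta["tid"]
    payload = response.json
    # ps=1&pn=1 fetches a single item; only the total count matters here
    count = payload.get("data", {}).get("page", {}).get("count", 0)
    print("tid {}: {} videos".format(tid, count))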
async def start_requests(self):
    yield Request(url="http://quotes.toscrape.com/page/1/")
async def _execute(self):
    for url in self.urls:
        yield Request(url=url, callback=self.parse_search)
async def start_requests(self):
    yield Request("https://www.imdb.com/chart/moviemeter")
async def get_origin_ip(self):
    """Make a request to httpbin to get the original IP."""
    r = Request('http://httpbin.org/headers?show_env')
    resp = await r.fetch()
    self.origin_ip = resp.json['headers']['X-Real-Ip']
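# The same one-off `Request(...).fetch()` pattern also works outside a
# handler. A minimal standalone sketch, assuming the library is imported
# as `acrawler` (the import path is an assumption; `fetch()` and
# `resp.json` are used exactly as above):
import asyncio
from acrawler import Request

async def main():
    resp = await Request('http://httpbin.org/headers?show_env').fetch()
    print(resp.json['headers']['X-Real-Ip'])

asyncio.run(main())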