Example #1
0
 def start_requests(self):
     """Seed the crawl with the V2EX hot-topics page.

     Emits one Request targeting the hot tab, handled by ``parse_hot``,
     with a recrawl value of 5 (units defined by Request — confirm) and
     relative links converted to absolute URLs.
     """
     request_options = {
         "url": "https://www.v2ex.com/?tab=hot",
         "callback": self.parse_hot,
         "recrawl": 5,
         "links_to_abs": True,
     }
     yield Request(**request_options)
Example #2
0
 async def _execute(self):
     """Schedule crawl requests for this task's configured resources.

     Reads the task configuration from ``self.meta``:
       - 'enable': when falsy, schedule nothing.
       - 'interval': recrawl interval, multiplied by 60 (presumably
         minutes converted to seconds — confirm); defaults to 60.
       - 'browser': when present, emit BrowserRequest with this value as
         the page callback.
       - 'css_divider': when present, attach a cached Parser-based
         callback to each plain Request.
       - 'resource': iterable of URLs to request (defaults to empty).

     Yields Request/BrowserRequest objects, then a final ``None``.
     """
     if self.meta.get('enable'):
         interval_s = self.meta.get('interval', 60) * 60
         if 'browser' in self.meta:
             for url in self.meta.get('resource', []):
                 r = BrowserRequest(url,
                                    page_callback=self.meta['browser'],
                                    recrawl=interval_s)
                 yield r
         else:
             cb = None
             if 'css_divider' in self.meta:
                 css_divider = self.meta['css_divider']
                 # Cache one Parser per divider so repeated executions
                 # reuse the same parse callback. ("x not in y" is the
                 # idiomatic form of "not x in y".)
                 if css_divider not in self.cb_table:
                     self.cb_table[css_divider] = Parser(
                         css_divider=css_divider,
                         item_type=ProxyParseItem).parse
                 cb = self.cb_table[css_divider]
             for url in self.meta.get('resource', []):
                 # Random priority spreads requests across the queue.
                 r = Request(url,
                             priority=random.randint(0, 100),
                             callback=self.crawler.parse,
                             recrawl=interval_s,
                             status_allowed=[])
                 if cb:
                     r.add_callback(cb)
                 yield r
     yield None
Example #3
0
 async def next_requests(self):
     """Continuously schedule proxy-check requests from a Redis set.

     Clears the temporary proxy set and the priority-queue key, starts a
     background task (``transfer_tmp``) that refills the tmp set, then
     loops forever: pop a proxy, look up its previous score, and schedule
     a check Request routed through that proxy.
     """
     await self.redis.delete(self.keys['tmp'])
     await self.redis.delete(self.pq_key)
     # Background task repopulates keys['tmp'], which the loop drains.
     self.create_task(self.transfer_tmp())
     while 1:
         proxy = await self.redis.spop(self.keys['tmp'])
         if proxy:
             # Redis returns bytes; decode to a plain string.
             proxy = proxy.decode()
             # Previous score is carried in meta so the callback can
             # compare old vs. new — presumably for re-scoring; verify
             # against the response handler.
             old_score = await self.redis.zscore(self.keys['score'], proxy)
             req = Request(
                 self.url,
                 # this will allow any response's status
                 status_allowed=[],
                 # this will not retry the request.
                 ignore_exception=True,
                 dont_filter=True,
                 request_config={
                     'timeout': 40,
                     'proxy': 'http://' + proxy
                 },
                 meta={
                     'proxy': proxy,
                     'old_score': old_score
                 })
             await self.add_task(req)
         else:
             # Tmp set is empty right now; back off before polling again.
             await asyncio.sleep(5)
Example #4
0
    async def start_requests(self):
        """Queue every enabled website's resource URLs in random order."""
        pending = [
            url
            for site in WEBSITES
            if site["enable"]
            for url in site["resource"]
        ]
        random.shuffle(pending)
        for target in pending:
            yield Request(target)
Example #5
0
 async def start_requests(self):
     """Kick off the crawl with the first top-search page."""
     first_page_url = wh_top(1)
     yield Request(first_page_url, callback=self.parse_top_search)
Example #6
0
 async def start_requests(self):
     """Emit one newlist API request per topic id in [MIN_TID, MAX_TID]."""
     api_template = "http://api.bilibili.com/x/web-interface/newlist?rid={}&pn=1&ps=1"
     for topic_id in range(MIN_TID, MAX_TID + 1):
         yield Request(api_template.format(topic_id),
                       callback=self.parse_json,
                       meta={"tid": topic_id})
Example #7
0
 async def start_requests(self):
     """Start crawling at page 1 of quotes.toscrape.com."""
     entry_url = "http://quotes.toscrape.com/page/1/"
     yield Request(url=entry_url)
Example #8
0
 async def _execute(self):
     """Yield a search-parsing request for every configured URL."""
     for target in self.urls:
         yield Request(url=target, callback=self.parse_search)
Example #9
0
 async def start_requests(self):
     """Begin the crawl at IMDb's Most Popular Movies chart."""
     chart_url = "https://www.imdb.com/chart/moviemeter"
     yield Request(chart_url)
Example #10
0
 async def get_origin_ip(self):
     """Make request to httpbin to get original IP."""
     # httpbin echoes the request headers; X-Real-Ip carries our address.
     probe = Request('http://httpbin.org/headers?show_env')
     response = await probe.fetch()
     self.origin_ip = response.json['headers']['X-Real-Ip']