def page_requests(self, query, **kwargs):
    max_records = kwargs.get('data_source_results')
    recent_days = kwargs.get('recent_days')
    site = kwargs.get('site')
    if site:
        site = site.replace('*.', '') if site.startswith("*.") else site
        query = quote(query) + "+" + quote("site:") + quote(site)
    else:
        query = quote(query)
    if recent_days:
        if recent_days == 1:
            filters = 'ex1%3a"ez1"'
        elif recent_days == 7:
            filters = 'ex1%3a"ez2"'
        elif recent_days == 30:
            filters = 'ex1%3a"ez3"'
        else:
            raise ValueError('recent_days: {}'.format(recent_days))
        raw_url = 'https://www.bing.com/search?q={}&filters={}'.format(query, filters)
    else:
        raw_url = 'https://www.bing.com/search?q={}'.format(query)
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        url = '{}&first={}'.format(raw_url, num + 1)
        yield HttpRequest(url)

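# A worked, standalone example of the Bing paging scheme above, runnable as
# is: the page size, the "past week" filter value and the URL layout are
# taken from the method itself; the query and site are illustrative inputs.
from urllib.parse import quote

query, site, page_size = 'python', '*.gov.cn', 10
q = quote(query) + "+" + quote("site:") + quote(site.replace('*.', ''))
for num in range(0, 30, page_size):
    print('https://www.bing.com/search?q={}&filters={}&first={}'.format(
        q, 'ex1%3a"ez2"', num + 1))
# prints first=1, first=11, first=21 -- Bing's result offset is 1-based
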
def page_requests(self, query, **kwargs):
    """
    Day: tbs=qdr:d
    Week: tbs=qdr:w
    Month: tbs=qdr:m
    Year: tbs=qdr:y
    +site%3A*.gov.cn
    """
    max_records = kwargs.get('data_source_results')
    recent_days = kwargs.get('recent_days')
    site = kwargs.get('site')
    if site:
        query = quote(query) + "+" + quote("site:") + quote(site)
    else:
        query = quote(query)
    if recent_days:
        if recent_days == 1:
            tbs = 'qdr:d'
        elif recent_days == 7:
            tbs = 'qdr:w'
        elif recent_days == 30:
            tbs = 'qdr:m'
        else:
            raise ValueError('recent_days: {}'.format(recent_days))
        raw_url = 'https://www.google.com.hk/search?q={}&tbs={}'.format(query, tbs)
    else:
        raw_url = 'https://www.google.com.hk/search?q={}'.format(query)
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        # Google's start parameter is 0-based (start=0, 10, 20, ...),
        # unlike Bing's 1-based first parameter
        url = '{}&start={}'.format(raw_url, num)
        yield HttpRequest(url)

def page_requests(self, query, **kwargs):
    max_records = kwargs.get('data_source_results')
    recent_days = kwargs.get('recent_days')
    site = kwargs.get('site')
    if site:
        query = query + " site:" + site
    if recent_days:
        if recent_days == 1:
            adv_t = 'd'
        elif recent_days == 7:
            adv_t = 'w'
        elif recent_days == 30:
            adv_t = 'm'
        else:
            raise ValueError('recent_days: {}'.format(recent_days))
        raw_url = 'https://www.so.com/s?q={}&adv_t={}'.format(
            quote(query), adv_t)
    else:
        raw_url = 'https://www.so.com/s?q={}'.format(quote(query))
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        url = '{}&pn={}'.format(raw_url, num // self.page_size + 1)
        yield HttpRequest(url)

def page_requests(self, query, **kwargs):
    max_records = kwargs.get('data_source_results')
    recent_days = kwargs.get('recent_days')
    site = kwargs.get('site')
    if max_records is None:
        max_records = self.page_size
    if site:
        query = query + " site:" + site
    if recent_days:
        today = datetime.now()
        if recent_days == 1:
            start = today + timedelta(days=-1)
        elif recent_days == 7:
            start = today + timedelta(days=-7)
        elif recent_days == 30:
            start = today + timedelta(days=-30)
        else:
            raise ValueError('recent_days: {}'.format(recent_days))
        start = int(time.mktime(start.timetuple()))
        end = int(time.mktime(today.timetuple()))
        raw_url = 'http://www.baidu.com/s?wd={}&gpc=stf%3D{}%2C{}|stftype%3D1'.format(
            quote(query), start, end)
    else:
        raw_url = 'http://www.baidu.com/s?wd={}'.format(quote(query))
    for num in range(0, max_records, self.page_size):
        url = '{}&pn={}'.format(raw_url, num)
        yield HttpRequest(url)

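# The Baidu time filter above packs a Unix-second range into
# gpc=stf%3D<start>%2C<end>|stftype%3D1 (the URL-encoded form of
# stf=<start>,<end>). A standalone sketch of the same computation, using
# only the standard library (the helper name is illustrative):
import time
from datetime import datetime, timedelta


def baidu_time_filter(recent_days):
    """Return the URL-encoded gpc value limiting results to recent_days."""
    today = datetime.now()
    start = today - timedelta(days=recent_days)
    start_ts = int(time.mktime(start.timetuple()))
    end_ts = int(time.mktime(today.timetuple()))
    return 'stf%3D{}%2C{}|stftype%3D1'.format(start_ts, end_ts)

# e.g. 'http://www.baidu.com/s?wd=python&gpc=' + baidu_time_filter(7)
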
def page_requests(self, query, **kwargs):
    """
    btf=d; btf=w; btf=m
    https://hk.search.yahoo.com/search?p=%E5%8C%97%E4%BA%AC+site%3A*.gov.cn
    """
    max_records = kwargs.get('data_source_results')
    recent_days = kwargs.get('recent_days')
    site = kwargs.get('site')
    if site:
        site = site.replace('*.', '') if site.startswith("*.") else site
        query = quote(query) + "+" + quote("site:") + quote(site)
    else:
        query = quote(query)
    if recent_days:
        if recent_days == 1:
            btf = 'd'
        elif recent_days == 7:
            btf = 'w'
        elif recent_days == 30:
            btf = 'm'
        else:
            raise ValueError('recent_days: {}'.format(recent_days))
        raw_url = 'https://hk.search.yahoo.com/search?q={}&btf={}'.format(
            query, btf)
    else:
        raw_url = 'https://hk.search.yahoo.com/search?q={}'.format(query)
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        url = '{}&b={}'.format(raw_url, num + 1)
        yield HttpRequest(url)

def page_requests(self, query, **kwargs):
    """
    tsn=1&sourceid=inttime_day
    tsn=2&sourceid=inttime_week
    tsn=3&sourceid=inttime_month
    e.g. 北京+site%3A*.gov.cn
    """
    max_records = kwargs.get('data_source_results')
    recent_days = kwargs.get('recent_days')
    site = kwargs.get('site')
    if site:
        query = query + " site:" + site
    if recent_days:
        if recent_days == 1:
            tsn, sourceid = 1, "inttime_day"
        elif recent_days == 7:
            tsn, sourceid = 2, "inttime_week"
        elif recent_days == 30:
            tsn, sourceid = 3, "inttime_month"
        else:
            raise ValueError('recent_days: {}'.format(recent_days))
        raw_url = 'https://www.sogou.com/web?query={}&tsn={}&sourceid={}'.format(
            quote(query), tsn, sourceid)
    else:
        raw_url = 'https://www.sogou.com/web?query={}'.format(quote(query))
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        url = '{}&page={}&ie=utf8'.format(raw_url, num // self.page_size + 1)
        yield HttpRequest(url)

def page_requests(self, query, **kwargs):
    max_records = kwargs.get('data_source_results')
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        url = 'http://www.chinaso.com/search/pagesearch.htm?q={}&page={}&wd={}'.format(
            quote(query), num // self.page_size + 1, quote(query))
        yield HttpRequest(url)

def page_requests(self, query, **kwargs):
    max_records = kwargs.get('data_source_results')
    if max_records is None:
        max_records = self.page_size
    for num in range(0, max_records, self.page_size):
        url = 'https://www.search.ask.com/web?q={}&page={}'.format(
            quote(query), num // self.page_size + 1)
        yield HttpRequest(url)

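# The page-number arithmetic shared by the so.com, Sogou, Chinaso and Ask
# generators above: the loop steps through record offsets, and
# num // page_size + 1 maps each offset to a 1-based page index.
page_size = 10
for num in range(0, 25, page_size):
    print(num, '->', num // page_size + 1)
# 0 -> 1, 10 -> 2, 20 -> 3
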
async def _get():
    # name and result come from the enclosing scope; result['url'] is
    # rewritten in place once the redirect target is known
    slave = self._slave_available(name)
    if slave is None:
        return
    try:
        real_url_req = HttpRequest(result['url'], allow_redirects=False)
        resp = await slave.fetch_url(real_url_req, name)
        location = resp['data']
        if location is not None:
            result['url'] = urljoin(result['url'], location)
    except Exception as e:
        log.warning('Failed to get real location %s: %s', result['url'], e)

async def update_cookies(self):
    """
    To avoid getting banned, periodically refresh cookies via the home page.
    """
    while True:
        try:
            req = HttpRequest('http://www.baidu.com/')
            await self.extension.handle_request(req)
            resp = await self.downloader.fetch(req)
            self.cookies.update(self.get_cookies_in_response(resp))
        except Exception as e:
            log.warning('Failed to update cookies: %s', e)
        finally:
            await asyncio.sleep(5 * 60)

async def update_cookies(self):
    while True:
        try:
            url = 'http://www.chinaso.com/search/pagesearch.htm?q={}'.format(
                quote('中国搜索'))
            try:
                req = HttpRequest(url, allow_redirects=False)
                await self.extension.handle_request(req)
                resp = await self.downloader.fetch(req)
            except HttpError as e:
                resp = e.response
            cookies = self.get_cookies_in_response(resp)
            self.cookies.update(cookies)
        except Exception as e:
            log.warning('Failed to update cookies: %s', e)
        finally:
            await asyncio.sleep(5 * 60)

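# get_cookies_in_response is not shown in this section; a plausible sketch
# using the standard library, assuming the response object exposes all of
# its Set-Cookie header values as a list (an assumption about the response
# API, not a documented interface):
from http.cookies import SimpleCookie


def get_cookies_in_response(resp):
    cookies = SimpleCookie()
    for set_cookie in resp.headers.getall('Set-Cookie', []):  # assumed API
        cookies.load(set_cookie)
    return cookies
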
def parse(self, response):
    selector = Selector(response.text)
    for quote in selector.css('div.quote'):
        text = quote.css('span.text')[0].text
        author = quote.css('small.author')[0].text
        author_url = quote.css('small+a')[0].attr('href')
        author_url = urljoin(str(response.url), author_url)
        tags = quote.css('div.tags a').text
        self.quotes.append(dict(text=text, tags=tags,
                                author=author, author_url=author_url))
    next_page = selector.css('li.next a')
    if len(next_page) > 0:
        next_page_url = urljoin(str(response.url), next_page[0].attr('href'))
        yield HttpRequest(next_page_url, callback=self.parse)

async def _fetch(self, request, name, rtype):
    body = pickle.dumps(request)
    timeout = self.config.get('timeout')
    req_headers = {'Content-Type': 'application/octet-stream'}
    timestamp = str(int(time.time()))
    nonce = str(random.randint(0, 10 ** 8))  # randint needs integer bounds, not 1e8
    signature = self.sign(body, name, rtype, timestamp, nonce)
    url = '{}?name={}&rtype={}&timestamp={}&nonce={}&signature={}'.format(
        self.api_url, name, rtype, timestamp, nonce, signature)
    req = HttpRequest(url, method='POST', headers=req_headers,
                      body=body, timeout=timeout)
    resp = await self.downloader.fetch(req)
    real_resp = json.loads(resp.body)
    return real_resp

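# The sign() helper is not shown above; a plausible sketch, assuming master
# and slave share a secret key and hash the query fields plus the pickled
# body (an illustrative scheme, not necessarily the project's actual one):
import hashlib


def sign(secret_key, body, name, rtype, timestamp, nonce):
    h = hashlib.sha1()
    for part in (secret_key, name, rtype, timestamp, nonce):
        h.update(part.encode('utf-8'))
    h.update(body)  # body is already bytes (the pickled request)
    return h.hexdigest()
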
# coding=utf-8
from xpaw import make_requests, HttpRequest

if __name__ == '__main__':
    requests = [
        'http://unknown',
        'http://python.org',
        HttpRequest('http://python.org')
    ]
    results = make_requests(requests)
    print(results)

def start_requests(self):
    yield HttpRequest('http://quotes.toscrape.com/', callback=self.parse)

def start_requests(self):
    yield HttpRequest('http://www.baidu.com', headers=self.headers,
                      callback=self.login)

def start_requests(self):
    for url in self.start_urls:
        yield HttpRequest(url, errback=self.handle_error)

def set_cookie_header(self, request: HttpRequest, cookies: SimpleCookie):
    if request.headers is None:
        request.headers = HttpHeaders()
    h = '; '.join('{}={}'.format(k, v.value) for k, v in cookies.items())
    request.headers.add('Cookie', h)

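# Usage sketch for set_cookie_header: SimpleCookie holds parsed cookies, and
# the join above flattens them into a single Cookie header value (the cookie
# names and values here are illustrative):
from http.cookies import SimpleCookie

cookies = SimpleCookie()
cookies['BAIDUID'] = 'example-id'
cookies['BDUSS'] = 'example-token'
header = '; '.join('{}={}'.format(k, v.value) for k, v in cookies.items())
print(header)  # BAIDUID=example-id; BDUSS=example-token
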
def start_requests(self):
    yield HttpRequest("http://news.baidu.com/", callback=self.parse,
                      dont_filter=True)

async def parse(self, response):
    selector = Selector(response.text)
    tags = selector.xpath("//div[contains(@class, 'tags-box')]//a").text
    self.log("Top ten tags: %s", tags)
    yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)

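# The spider methods above follow xpaw's example style; assembled into a
# complete, runnable script (a sketch: the class name is illustrative, and
# the run_spider import and config keyword are assumed to match the
# project's documented examples):
from xpaw import Spider, HttpRequest, Selector, run_spider


class TagsSpider(Spider):
    def start_requests(self):
        yield HttpRequest("http://quotes.toscrape.com/", callback=self.parse)

    async def parse(self, response):
        selector = Selector(response.text)
        tags = selector.xpath("//div[contains(@class, 'tags-box')]//a").text
        self.log("Top ten tags: %s", tags)


if __name__ == '__main__':
    run_spider(TagsSpider, log_level='DEBUG')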