async def _download_request_page(self, request: Request, spider: Spider, page: Page) -> Response:
    self.stats.inc_value("pyppeteer/page_count")
    if self.navigation_timeout is not None:
        page.setDefaultNavigationTimeout(self.navigation_timeout)
    await page.setRequestInterception(True)
    page.on("request", partial(_request_handler, scrapy_request=request, stats=self.stats))
    page.on("response", partial(_response_handler, stats=self.stats))

    start_time = time()
    response = await page.goto(request.url)

    page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
    if isinstance(page_coroutines, dict):
        page_coroutines = page_coroutines.values()
    for pc in page_coroutines:
        if isinstance(pc, PageCoroutine):
            method = getattr(page, pc.method)
            # set PageCoroutine timeout
            if self.page_coroutine_timeout is not None and not pc.kwargs.get("timeout", None):
                pc.kwargs["timeout"] = self.page_coroutine_timeout
            if isinstance(pc, NavigationPageCoroutine):
                await asyncio.gather(page.waitForNavigation(), method(*pc.args, **pc.kwargs))
            else:
                pc.result = await method(*pc.args, **pc.kwargs)

    body = (await page.content()).encode("utf8")
    request.meta["download_latency"] = time() - start_time

    callback = request.callback or spider.parse
    annotations = getattr(callback, "__annotations__", {})
    for key, value in annotations.items():
        if value is pyppeteer.page.Page:
            request.cb_kwargs[key] = page
            self.stats.inc_value("pyppeteer/page_count/injected_callback")
            break
    else:
        await page.close()
        self.stats.inc_value("pyppeteer/page_count/closed")

    headers = Headers(response.headers)
    headers.pop("Content-Encoding", None)
    respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
    return respcls(
        url=page.url,
        status=response.status,
        headers=headers,
        body=body,
        request=request,
        flags=["pyppeteer"],
    )
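A spider drives this handler through request meta. A minimal, hypothetical usage sketch: the import path, URL, and selector are assumptions, while the "pyppeteer_page_coroutines" meta key and the PageCoroutine/NavigationPageCoroutine classes are the ones the handler above reads.

import scrapy
from scrapy_pyppeteer.page import PageCoroutine, NavigationPageCoroutine  # assumed import path

class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.org",  # illustrative URL
            meta={
                "pyppeteer_page_coroutines": [
                    # click a link and wait for the resulting navigation
                    NavigationPageCoroutine("click", "a.next"),
                    # then screenshot the newly loaded page
                    PageCoroutine("screenshot", path="page.png"),
                ]
            },
        )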
def request_album_detail(self, aid):
    headers = Headers()
    headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
    url = "https://www.aiyinsitanfm.com/album/%s.html" % aid
    req = scrapy.Request(url=url, callback=self.parse_album_detail,
                         method="GET", headers=headers)
    return req
def start_requests(self):
    headers = Headers()
    headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    # data = {"album_id": "1241|13509"}
    data = "album_id=1029|1010080&order_type=1&page_num=1"
    yield scrapy.Request(url=self.startUrl, callback=self.parse, method="POST",
                         headers=headers, body=data)
def request_album(self, did, page):
    headers = Headers()
    headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
    url = "https://www.aiyinsitanfm.com/pcalbum_info/get_page_list"
    req = scrapy.Request(url=url, callback=self.parse_album, method="POST",
                         headers=headers,
                         body="album_id=%s&order_type=1&page_num=%s" % (did, page))
    # print("=============--------album %s---------================" % did)
    return req
def request_classify(self, tid, page):
    headers = Headers()
    headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
    url = "https://www.aiyinsitanfm.com/pcall_types/get_page_list"
    req = scrapy.Request(url=url, callback=self.parse_classify, method="POST",
                         headers=headers,
                         body="type_id=%s&sort_type=1&page_num=%s" % (tid, str(page)))
    # print("=============--------classify %s---------================" % tid)
    return req
def request_all_audio(self, audio_id):
    headers = Headers()
    headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
    url = "https://www.aiyinsitanfm.com/pcplayer/get_all_list"
    req = scrapy.Request(url=url, callback=self.parse_audio, method="POST",
                         headers=headers, body="audio_id=%s" % audio_id)
    print("=============--------audio %s---------================" % audio_id)
    return req
async def test_get_response_encoding():
    assert (
        _get_response_encoding(
            headers=Headers({"content-type": "text/html; charset=UTF-8"}),
            body="",
        )
        == "utf-8"
    )
    assert (
        _get_response_encoding(
            headers=Headers(),
            body="""<!doctype html>
<html lang="cn">
<head>
<meta charset="gb2312">
</head>
</html>
""",
        )
        == "gb18030"
    )
    assert _get_response_encoding(headers=Headers(), body="") is None
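The helper under test is not shown here. A minimal sketch that would satisfy these assertions, assuming w3lib is available (w3lib resolves the gb2312 alias to gb18030); this is a reconstruction from the test, not the project's actual implementation:

from typing import Optional
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

def _get_response_encoding(headers, body) -> Optional[str]:
    # Scrapy Headers values are bytes; decode the Content-Type for w3lib
    content_type = (headers.get("content-type") or b"").decode("latin-1")
    # prefer the HTTP charset, fall back to the <meta charset> declaration
    return http_content_type_encoding(content_type) or html_body_declared_encoding(body)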
def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
    self.headers = Headers(headers or {})
    self.status = int(status)
    self._set_body(body)
    self._set_url(url)
    self.request = request
    self.flags = [] if flags is None else list(flags)
def process_request(self, request, spider):
    if ('browser' not in request.meta
            or request.meta.get('_browser_processed')):
        return
    request.meta['_browser_processed'] = True
    browser_options = request.meta['browser']
    endpoint = browser_options.setdefault(
        'endpoint',
        self.default_endpoint,
    )
    browser_base_url = browser_options.get(
        'browser_url',
        self.browser_adapter_url,
    )
    browser_url = urljoin(browser_base_url, endpoint)
    args = browser_options.setdefault('args', {})
    args.setdefault('url', request.url)
    return request.replace(
        url='browser+' + browser_url,
        method='POST',
        body=json.dumps(
            args,
            ensure_ascii=False,
            sort_keys=True,
        ),
        headers=Headers({
            'Content-Type': 'application/json',
        }),
    )
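Everything this middleware consumes comes from the 'browser' request meta key. A hypothetical spider-side sketch; the endpoint name and args are illustrative, but 'browser', 'endpoint', and 'args' are the keys the middleware reads:

import scrapy

class BrowserSpider(scrapy.Spider):
    name = "browser_example"

    def start_requests(self):
        # the 'browser' meta key opts this request into the middleware;
        # unspecified options fall back to the middleware defaults
        yield scrapy.Request(
            "https://example.org",
            meta={"browser": {"endpoint": "render.html", "args": {"wait": 0.5}}},
        )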
def process_request(self, request, spider):
    header = {
        'User-Agent': random.choice(self.useragent),
        'Accept': '*/*',
        # 'Accept-Language': 'zh-CN,en-US;q=0.7,en;q=0.3',
        # 'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://douban.com/',
        'Connection': 'keep-alive'
    }
    # Headers(header) replaces the whole header set, so a separate
    # request.headers['User-Agent'] assignment beforehand would be redundant.
    request.headers = Headers(header)
    request.cookies = {
        "__gads": "ID=0324bafb0f44eca5-226fa0ca44b90057:T=1616311402:RT=1616311402:S=ALNI_MaSfg6lhQif7aq5ex-5wojyq4TyQQ",
        "__utma": "30149280.92835787.1616310938.1616310938.1616330238.2",
        "__utmb": "30149280.16.10.1616330238",
        "__utmc": "30149280",
        "__utmz": "30149280.1616310938.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic",
        "_ga": "GA1.2.92835787.1616310938",
        "_gid": "GA1.2.682758769.1616330335",
        "ap_v": "0,6.0",
        "bid": "fGhRi940DQI",
        "gr_cs1_b78271da-cfd8-43ed-8efe-a1621f8f61f1": "user_id:0",
        "gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03": "b78271da-cfd8-43ed-8efe-a1621f8f61f1",
        "gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_b78271da-cfd8-43ed-8efe-a1621f8f61f1": "true",
        "gr_user_id": "498db6b9-0fb4-4d5a-84cd-6cb7783c9ece",
        "Hm_lpvt_6d4a8cfea88fa457c3127e14fb5fabc2": "1616331952",
        "Hm_lvt_6d4a8cfea88fa457c3127e14fb5fabc2": "1616330372,1616330460,1616331068,1616331952",
        "ll": "\"118371\"",
        "viewed": "\"1007305_35315150_35315153_35315159\"",
    }
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None, cb_kwargs=None):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    assert isinstance(priority, int), "Request priority not an integer: %r" % priority
    self.priority = priority

    if callback is not None and not callable(callback):
        raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
    if errback is not None and not callable(errback):
        raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
    self.flags = [] if flags is None else list(flags)
def process_request(self, request, spider):
    splash_options = request.meta.get('splash')
    if not splash_options:
        return

    if request.method != 'GET':
        log.msg("Only GET requests are supported by SplashMiddleware; "
                "%s will be handled without Splash" % request, logging.WARNING)
        return request

    for key, value in splash_options.items():
        if key.lower() == 'timeout':
            request.meta['download_timeout'] = max(
                request.meta.get('download_timeout', 1e6),
                float(value) + self.SPLASH_EXTRA_TIMEOUT
            )

    if self.RESPECT_SLOTS:
        # Use the same download slot to (sort of) respect download
        # delays and concurrency options.
        request.meta['download_slot'] = self._get_slot_key(request)

    del request.meta['splash']
    request.meta['_splash'] = True
    request.meta['_origin_url'] = request.url

    # FIXME: original HTTP headers are not respected.
    # To respect them changes to Splash are needed.
    request.headers = Headers({'Content-Type': 'application/json'})
    request._set_url(self.splash_url(splash_options, request.url))

    self.crawler.stats.inc_value('splash/request_count')
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0.0,
             dont_filter=False, errback=None):
    self._encoding = encoding  # this one has to be set first
    self.method = method.upper()
    self._set_url(url)
    self._set_body(body)
    self.priority = priority

    assert callback or not errback, "Cannot use errback without a callback"
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
def parse(self, response):
    f = open('data/yearmonth/l1.csv', 'w')
    headers = Headers({'Content-Type': 'application/json'})
    for month in response.css("a.normtxt::attr('href')").extract():
        if month.endswith(".cms"):
            t_url = urlparse.urljoin("http://timesofindia.indiatimes.com/", month)
            year = "%04d" % int(month.split("/")[2].split(",")[0].split("-")[1].strip())
            month = "%02d" % int(month.split("/")[2].split(",")[1].split("-")[1].replace(".cms", "").strip())
            if int(year) >= 2017 and int(month) <= 2:
                f.write("%s\t%s\t%s\n" % (t_url, year, month))
                body = json.dumps({"url": t_url, "wait": 0.5}, sort_keys=True)
                yield scrapy.Request(RENDER_HTML_URL, self.parse_l2,
                                     method="POST", body=body, headers=headers)
    f.close()
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None):
    self._encoding = encoding  # this one has to be set first
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    assert isinstance(priority, int), "Request priority not an integer: %r" % priority
    self.priority = priority

    assert callback or not errback, "Cannot use errback without a callback"
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self.flags = [] if flags is None else list(flags)
def kayak_requests(self, dest):
    print('INSIDE KAYAK REQUESTS')
    kayak_url = 'https://www.kayak.com/flights/GPT-{}/2017-09-16/2017-09-17'.format(dest)
    print('kayak url = ' + kayak_url)
    body = json.dumps({"url": kayak_url, "wait": 1.5}, sort_keys=True)
    headers = Headers({'Content-Type': 'application/json'})
    yield SplashRequest(RENDER_HTML_URL, self.parse_kayak,
                        method='POST', body=body, headers=headers)
def start_requests(self):
    for url in self.start_urls:
        print(url)
        time.sleep(1)
        body = json.dumps({"url": url, "wait": 2.5}, sort_keys=True)
        headers = Headers({'Content-Type': 'application/json'})
        yield SplashRequest(RENDER_HTML_URL, self.parse,
                            method='POST', body=body, headers=headers)
def get_headers(cls):
    return Headers({
        # 'User-Agent': self._get_user_agent(),
        # 'Content-Type': 'application/json',
        # "Connection": "keep-alive",
        'Accept': 'application/json; charset=utf-8',
        'Host': 'm.weibo.cn',
    })
def get_headers(cls):
    return Headers({
        # 'User-Agent': self._get_user_agent(),
        # 'Content-Type': 'application/json',
        # "Connection": "keep-alive",
        'Accept': 'application/json',
        # 'Host': cls.BASE_URL,
    })
def start_requests(self):
    for url in self.start_urls:
        body = json.dumps({"url": url, "wait": 0.5, "js_enabled": False})
        headers = Headers({'Content-Type': 'application/json'})
        yield scrapy.Request(RENDER_HTML_URL, self.parse,
                             method="POST", body=body, headers=headers)
def start_requests(self):
    self.setencoding()
    url = "https://www.dongqiudi.com/"
    headers = Headers({
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/json'
    })
    yield Request(url, self.parse, headers=headers)
def get_status_headers(cls, uid):
    return Headers({
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh;q=0.6',
        'Accept': 'application/json, text/plain, */*',
        'Host': 'm.weibo.cn',
        'Referer': cls.get_m_weibo_home_url(uid),
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
    })
def retrieve_response(self, spider, request):
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    url = data['url']
    status = data['status']
    headers = Headers(data['headers'])
    body = data['html']
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def process_request(self, request, spider):
    # Called for each request that goes through the downloader
    # middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called

    # `headers` was undefined in the original snippet; a placeholder
    # dict is used here so the example runs (value is illustrative).
    headers = {'User-Agent': 'Mozilla/5.0'}
    request.headers = Headers(headers)
    return None
def retrieve_response(self, spider: TSpider, request: TRequest) -> Optional[TResponse]:
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    url = data["url"]
    status = data["status"]
    headers = Headers(data["headers"])
    body = data["body"]
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
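The matching write path of this cache storage is not shown. A minimal sketch of what it might look like, assuming a `_write_data` helper symmetric to `_read_data` (the helper name is an assumption) and mirroring the keys read above:

def store_response(self, spider: TSpider, request: TRequest, response: TResponse) -> None:
    # mirror the keys that retrieve_response() reads back
    data = {
        "status": response.status,
        "url": response.url,
        "headers": dict(response.headers),
        "body": response.body,
    }
    self._write_data(spider, request, data)  # hypothetical helper, symmetric to _read_data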
def get_single_status_headers(cls, uid):
    """
    Headers for a single weibo status.
    :param uid:
    :return:
    """
    return Headers({
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh;q=0.6',
        'Host': 'm.weibo.cn',
        'Referer': cls.get_m_weibo_home_url(uid),
    })
def start_requests(self):
    for word, url in self.start_urls:
        print("INDEX URL : " + url)
        headers = Headers({'Content-Type': 'application/json'})
        # request = scrapy.Request(url, self.parse, headers=headers, meta={
        #     'splash': {
        #         'endpoint': 'render.html',
        #         'args': {'wait': 1.0}
        #     }
        # })
        request = scrapy.Request(url, self.parse, headers=headers)
        request.meta['word'] = word
        yield request
async def _download_request_with_page(self, request: Request, spider: Spider, page: Page) -> Response:
    start_time = time()
    response = await page.goto(request.url)

    page_coroutines = request.meta.get("playwright_page_coroutines") or ()
    if isinstance(page_coroutines, dict):
        page_coroutines = page_coroutines.values()
    for pc in page_coroutines:
        if isinstance(pc, PageCoroutine):
            method = getattr(page, pc.method)
            pc.result = await method(*pc.args, **pc.kwargs)
            await page.wait_for_load_state(timeout=self.default_navigation_timeout)

    body = (await page.content()).encode("utf8")
    request.meta["download_latency"] = time() - start_time

    if request.meta.get("playwright_include_page"):
        request.meta["playwright_page"] = page
    else:
        await page.close()
        self.stats.inc_value("playwright/page_count/closed")

    headers = Headers(response.headers)
    headers.pop("Content-Encoding", None)
    respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
    return respcls(
        url=page.url,
        status=response.status,
        headers=headers,
        body=body,
        request=request,
        flags=["playwright"],
    )
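As with the pyppeteer handler above, a spider drives this one through request meta. A hedged usage sketch using the two meta keys the handler reads, "playwright_page_coroutines" and "playwright_include_page" (the import path, URL, and selector are assumptions):

import scrapy
from scrapy_playwright.page import PageCoroutine  # assumed path for this version

class PlaywrightSpider(scrapy.Spider):
    name = "playwright_example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.org",  # illustrative URL
            meta={
                "playwright_page_coroutines": [
                    PageCoroutine("wait_for_selector", "div.content"),
                ],
                # keep the page open and receive it in request.meta["playwright_page"]
                "playwright_include_page": True,
            },
        )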
def process_request(self, request, spider):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Host': 'weibo.cn',
        'Cookie': 'xxx',
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }
    request.headers = Headers(headers)
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None):
    ## encoding
    self._encoding = encoding  # this one has to be set first
    ## request method
    self.method = str(method).upper()
    ## set the URL
    self._set_url(url)
    ## set the body
    self._set_body(body)
    assert isinstance(priority, int), "Request priority not an integer: %r" % priority
    ## priority
    self.priority = priority

    if callback is not None and not callable(callback):
        raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
    if errback is not None and not callable(errback):
        raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
    assert callback or not errback, "Cannot use errback without a callback"
    ## callback function
    self.callback = callback
    ## error callback function
    self.errback = errback

    ## cookies
    self.cookies = cookies or {}
    ## build the request headers
    self.headers = Headers(headers or {}, encoding=encoding)
    ## whether to skip the duplicate filter
    self.dont_filter = dont_filter

    ## extra metadata
    self._meta = dict(meta) if meta else None
    self.flags = [] if flags is None else list(flags)
def process_request(self, request, spider):
    if 'splash' not in request.meta:
        return

    if request.method not in {'GET', 'POST'}:
        logger.warning(
            "Currently only GET and POST requests are supported by "
            "SplashMiddleware; %(request)s will be handled without Splash",
            {'request': request},
            extra={'spider': spider}
        )
        return request

    if request.meta.get("_splash_processed"):
        # don't process the same request more than once
        return

    splash_options = request.meta['splash']
    request.meta['_splash_processed'] = True

    slot_policy = splash_options.get('slot_policy', self.slot_policy)
    self._set_download_slot(request, request.meta, slot_policy)

    args = splash_options.setdefault('args', {})

    if '_replaced_args' in splash_options:
        # restore arguments before sending request to the downloader
        load_args = {}
        save_args = []
        local_arg_fingerprints = {}
        for name in splash_options['_replaced_args']:
            fp = args[name]
            # Use remote Splash argument cache: if the Splash key
            # for a value is known then don't send the value to Splash;
            # if it is unknown then try to save the value on the server
            # using ``save_args``.
            if fp in self._remote_keys:
                load_args[name] = self._remote_keys[fp]
                del args[name]
            else:
                save_args.append(name)
                args[name] = self._argument_values[fp]
            local_arg_fingerprints[name] = fp

        if load_args:
            args['load_args'] = load_args
        if save_args:
            args['save_args'] = save_args
        splash_options['_local_arg_fingerprints'] = local_arg_fingerprints
        del splash_options['_replaced_args']

    args.setdefault('url', request.url)
    if request.method == 'POST':
        args.setdefault('http_method', request.method)
        # XXX: non-UTF8 bodies are not supported now
        args.setdefault('body', request.body.decode('utf8'))

    if not splash_options.get('dont_send_headers'):
        headers = scrapy_headers_to_unicode_dict(request.headers)
        if headers:
            args.setdefault('headers', headers)

    body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4)

    if 'timeout' in args:
        # User requested a Splash timeout explicitly.
        #
        # We can't catch a case when user requested `download_timeout`
        # explicitly because a default value for `download_timeout`
        # is set by DownloadTimeoutMiddleware.
        #
        # As user requested Splash timeout explicitly, we shouldn't change
        # it. Another reason not to change the requested Splash timeout is
        # because it may cause a validation error on the remote end.
        #
        # But we can change Scrapy `download_timeout`: increase
        # it when it's too small. Decreasing `download_timeout` is not
        # safe.
        timeout_requested = float(args['timeout'])
        timeout_expected = timeout_requested + self.splash_extra_timeout

        # no timeout means infinite timeout
        timeout_current = request.meta.get('download_timeout', 1e6)

        if timeout_expected > timeout_current:
            request.meta['download_timeout'] = timeout_expected

    endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
    splash_base_url = splash_options.get('splash_url', self.splash_base_url)
    splash_url = urljoin(splash_base_url, endpoint)

    headers = Headers({'Content-Type': 'application/json'})
    headers.update(splash_options.get('splash_headers', {}))
    new_request = request.replace(
        url=splash_url,
        method='POST',
        body=body,
        headers=headers,
        priority=request.priority + self.rescheduling_priority_adjust
    )
    self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
    return new_request
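On the spider side, all of the above is triggered by the 'splash' request meta key, which is scrapy-splash's documented interface; the middleware then rewrites the request into a POST against the Splash endpoint. A minimal sketch with illustrative argument values:

import scrapy

class SplashSpider(scrapy.Spider):
    name = "splash_example"

    def start_requests(self):
        # 'endpoint' and 'args' match the splash_options keys read above
        yield scrapy.Request(
            "https://example.org",
            callback=self.parse,
            meta={"splash": {"endpoint": "render.html", "args": {"wait": 0.5}}},
        )

    def parse(self, response):
        self.logger.info("rendered %s", response.url)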