Code example #1
    async def _download_request_page(self, request: Request, spider: Spider,
                                     page: Page) -> Response:
        self.stats.inc_value("pyppeteer/page_count")
        if self.navigation_timeout is not None:
            page.setDefaultNavigationTimeout(self.navigation_timeout)
        await page.setRequestInterception(True)
        page.on(
            "request",
            partial(_request_handler, scrapy_request=request,
                    stats=self.stats))
        page.on("response", partial(_response_handler, stats=self.stats))

        start_time = time()
        response = await page.goto(request.url)

        page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if isinstance(pc, PageCoroutine):
                method = getattr(page, pc.method)

                # set PageCoroutine timeout
                if self.page_coroutine_timeout is not None and not pc.kwargs.get(
                        "timeout", None):
                    pc.kwargs["timeout"] = self.page_coroutine_timeout

                if isinstance(pc, NavigationPageCoroutine):
                    await asyncio.gather(page.waitForNavigation(),
                                         method(*pc.args, **pc.kwargs))
                else:
                    pc.result = await method(*pc.args, **pc.kwargs)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        callback = request.callback or spider.parse
        annotations = getattr(callback, "__annotations__", {})
        for key, value in annotations.items():
            if value is pyppeteer.page.Page:
                request.cb_kwargs[key] = page
                self.stats.inc_value("pyppeteer/page_count/injected_callback")
                break
        else:
            await page.close()
            self.stats.inc_value("pyppeteer/page_count/closed")

        headers = Headers(response.headers)
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers,
                                          url=page.url,
                                          body=body)
        return respcls(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["pyppeteer"],
        )
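The handler above only hands the Pyppeteer page to the spider when the callback's type annotations ask for it. A minimal spider-side sketch (my own illustration; the spider name and URL are placeholders, and it assumes this handler is installed as the project's download handler):

import pyppeteer
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["http://example.org"]

    async def parse(self, response, page: pyppeteer.page.Page):
        # The annotation matches the check in the handler, so the live page is
        # injected via cb_kwargs instead of being closed after the download.
        title = await page.title()
        await page.close()  # the spider is now responsible for closing it
        yield {"title": title}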
Code example #2
 def request_album_detail(self, aid):
     headers = Headers()
     headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
     url = "https://www.aiyinsitanfm.com/album/%s.html" % aid
     req = scrapy.Request(url=url,
                          callback=self.parse_album_detail,
                          method="GET",
                          headers=headers)
     return req
Code example #3
 def start_requests(self):
     headers = Headers()
     headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
     #data = {"album_id": "1241|13509"}
     data = "album_id=1029|1010080&order_type=1&page_num=1"
     yield scrapy.Request(url=self.startUrl,
                          callback=self.parse,
                          method="POST",
                          headers=headers,
                          body=data)
Code example #4
 def request_album(self, did, page):
     headers = Headers()
     headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
     url = "https://www.aiyinsitanfm.com/pcalbum_info/get_page_list"
     req = scrapy.Request(url=url,
                          callback=self.parse_album,
                          method="POST",
                          headers=headers,
                          body="album_id=%s&order_type=1&page_num=%s" %
                          (did, page))
     # print("=============--------album %s---------================" % did)
     return req
Code example #5
 def request_classify(self, tid, page):
     headers = Headers()
     headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
     url = "https://www.aiyinsitanfm.com/pcall_types/get_page_list"
     req = scrapy.Request(url=url,
                          callback=self.parse_classify,
                          method="POST",
                          headers=headers,
                          body="type_id=%s&sort_type=1&page_num=%s" %
                          (tid, str(page)))
     # print("=============--------classify %s---------================" % tid)
     return req
Code example #6
    def request_all_audio(self, audio_id):
        headers = Headers()
        headers.setdefault("Content-Type", "application/x-www-form-urlencoded")
        url = "https://www.aiyinsitanfm.com/pcplayer/get_all_list"
        req = scrapy.Request(url=url,
                             callback=self.parse_audio,
                             method="POST",
                             headers=headers,
                             body="audio_id=%s" % audio_id)

        print("=============--------audio %s---------================" %
              audio_id)
        return req
Code example #7
async def test_get_response_encoding():
    assert (_get_response_encoding(
        headers=Headers({"content-type": "text/html; charset=UTF-8"}),
        body="",
    ) == "utf-8")
    assert (_get_response_encoding(
        headers=Headers(),
        body="""<!doctype html>
<html lang="cn">
<head>
  <meta charset="gb2312">
</head>
</html>
""",
    ) == "gb18030")
    assert _get_response_encoding(headers=Headers(), body="") is None
Code example #8
File: __init__.py Project: pabitra10/company-crawl
 def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
     self.headers = Headers(headers or {})
     self.status = int(status)
     self._set_body(body)
     self._set_url(url)
     self.request = request
     self.flags = [] if flags is None else list(flags)
Code example #9
    def process_request(self, request, spider):
        if ('browser' not in request.meta
                or request.meta.get('_browser_processed')):
            return

        request.meta['_browser_processed'] = True
        browser_options = request.meta['browser']

        endpoint = browser_options.setdefault(
            'endpoint',
            self.default_endpoint,
        )
        browser_base_url = browser_options.get(
            'browser_url',
            self.browser_adapter_url,
        )
        browser_url = urljoin(browser_base_url, endpoint)

        args = browser_options.setdefault('args', {})
        args.setdefault('url', request.url)

        return request.replace(
            url='browser+' + browser_url,
            method='POST',
            body=json.dumps(
                args,
                ensure_ascii=False,
                sort_keys=True,
            ),
            headers=Headers({
                'Content-Type': 'application/json',
            }),
        )
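A minimal sketch of how a spider might opt in to the middleware above (an assumption based only on this snippet, not a documented API): any request carrying a 'browser' dict in its meta is rewritten into a JSON POST to the browser adapter, with the original URL passed through args['url'].

import scrapy


class RenderedSpider(scrapy.Spider):
    name = "rendered"

    def start_requests(self):
        # 'wait' is a hypothetical renderer argument; the middleware itself only
        # guarantees that args['url'] defaults to the request URL.
        yield scrapy.Request(
            "https://example.org",
            meta={"browser": {"args": {"wait": 1.0}}},
        )

    def parse(self, response):
        yield {"url": response.url, "length": len(response.body)}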
Code example #10
File: middlewares.py Project: AororaSSS/homeworks
 def process_request(self, request, spider):
     request.headers['User-Agent'] = random.choice(self.useragent)
     header = {
         'User-Agent': random.choice(self.useragent),
         'Accept': '*/*',
         #'Accept-Language': 'zh-CN,en-US;q=0.7,en;q=0.3',
         #'Accept-Encoding': 'gzip, deflate, br',
         'Referer': 'https://douban.com/',
         'Connection': 'keep-alive'
         }
     request.headers = Headers(header)
     
     request.cookies = {
         "__gads": "ID=0324bafb0f44eca5-226fa0ca44b90057:T=1616311402:RT=1616311402:S=ALNI_MaSfg6lhQif7aq5ex-5wojyq4TyQQ",
         "__utma": "30149280.92835787.1616310938.1616310938.1616330238.2",
         "__utmb": "30149280.16.10.1616330238",
         "__utmc": "30149280",
         "__utmz": "30149280.1616310938.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic",
         "_ga": "GA1.2.92835787.1616310938",
         "_gid": "GA1.2.682758769.1616330335",
         "ap_v": "0,6.0",
         "bid": "fGhRi940DQI",
         "gr_cs1_b78271da-cfd8-43ed-8efe-a1621f8f61f1": "user_id:0",
         "gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03": "b78271da-cfd8-43ed-8efe-a1621f8f61f1",
         "gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_b78271da-cfd8-43ed-8efe-a1621f8f61f1": "true",
         "gr_user_id": "498db6b9-0fb4-4d5a-84cd-6cb7783c9ece",
         "Hm_lpvt_6d4a8cfea88fa457c3127e14fb5fabc2": "1616331952",
         "Hm_lvt_6d4a8cfea88fa457c3127e14fb5fabc2": "1616330372,1616330460,1616331068,1616331952",
         "ll": "\"118371\"",
         "viewed": "\"1007305_35315150_35315153_35315159\""
     }
Code example #11
    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None, cb_kwargs=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)
Code example #12
    def process_request(self, request, spider):
        splash_options = request.meta.get('splash')
        if not splash_options:
            return

        if request.method != 'GET':
            log.msg(
                "Only GET requests are supported by SplashMiddleware; %s will be handled without Splash"
                % request, logging.WARNING)
            return request

        for key, value in splash_options.items():
            if key.lower() == 'timeout':
                request.meta['download_timeout'] = max(
                    request.meta.get('download_timeout', 1e6),
                    float(value) + self.SPLASH_EXTRA_TIMEOUT)

        if self.RESPECT_SLOTS:
            # Use the same download slot to (sort of) respect download
            # delays and concurrency options.
            request.meta['download_slot'] = self._get_slot_key(request)

        del request.meta['splash']
        request.meta['_splash'] = True
        request.meta['_origin_url'] = request.url

        # FIXME: original HTTP headers are not respected.
        # To respect them changes to Splash are needed.
        request.headers = Headers({'Content-Type': 'application/json'})
        request._set_url(self.splash_url(splash_options, request.url))

        self.crawler.stats.inc_value('splash/request_count')
Code example #13
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0.0,
                 dont_filter=False,
                 errback=None):

        self._encoding = encoding  # this one has to be set first
        self.method = method.upper()
        self._set_url(url)
        self._set_body(body)
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
Code example #14
 def parse(self, response):
     f = open('data/yearmonth/l1.csv', 'w')
     headers = Headers({'Content-Type': 'application/json'})
     for month in response.css("a.normtxt::attr('href')").extract():
         if month.endswith(".cms"):
             t_url = urlparse.urljoin("http://timesofindia.indiatimes.com/",
                                      month)
             year = "%04d" % (int(
                 (month.split("/")[2].split(",")[0].split("-")[1].strip())))
             month = "%02d" % (int(
                 month.split("/")[2].split(",")[1].split("-")[1].replace(
                     ".cms", "").strip()))
             if int(year) >= 2017 and int(month) <= 2:
                 f.write("%s\t%s\t%s\n" % (t_url, year, month))
                 body = json.dumps({
                     "url": t_url,
                     "wait": 0.5
                 },
                                   sort_keys=True)
                 yield scrapy.Request(RENDER_HTML_URL,
                                      self.parse_l2,
                                      method="POST",
                                      body=body,
                                      headers=headers)
     f.close()
Code example #15
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0,
                 dont_filter=False,
                 errback=None,
                 flags=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(
            priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self.flags = [] if flags is None else list(flags)
Code example #16
 def kayak_requests(self, dest):
     print('INSIDE KAYAK REQUESTS')
     kayak_url = 'https://www.kayak.com/flights/GPT-{}/2017-09-16/2017-09-17'.format(dest)
     print('kayak url = ' + kayak_url)
     body = json.dumps({"url": kayak_url, "wait": 1.5}, sort_keys=True)
     headers = Headers({'Content-Type': 'application/json'})
     yield SplashRequest(RENDER_HTML_URL, self.parse_kayak, method='POST',
                         body=body, headers=headers)
Code example #17
 def start_requests(self):
     for url in self.start_urls:
         print(url)
         time.sleep(1)
         body = json.dumps({"url": url, "wait":2.5}, sort_keys=True)
         headers = Headers({'Content-Type': 'application/json'})
         yield SplashRequest(RENDER_HTML_URL, self.parse, method='POST',
                             body=body, headers=headers)
Code example #18
File: utils.py Project: wusir2001/galaxy
 def get_headers(cls):
     return Headers({
         # 'User-Agent': self._get_user_agent(),
         # 'Content-Type': 'application/json',
         # "Connection": "keep-alive",
         'Accept': 'application/json; charset=utf-8',
         'Host': 'm.weibo.cn',
     })
Code example #19
 def get_headers(cls):
     return Headers({
         # 'User-Agent': self._get_user_agent(),
         # 'Content-Type': 'application/json',
         # "Connection": "keep-alive",
         'Accept': 'application/json',
         # 'Host': cls.BASE_URL,
     })
Code example #20
File: url_grabber.py Project: sherkt1/disqus-crawler
 def start_requests(self):
     for url in self.start_urls:
         body = json.dumps({"url": url, "wait": 0.5, "js_enabled": False})
         headers = Headers({'Content-Type': 'application/json'})
         yield scrapy.Request(RENDER_HTML_URL,
                              self.parse,
                              method="POST",
                              body=body,
                              headers=headers)
Code example #21
    def start_requests(self):
        self.setencoding()
        url = "https://www.dongqiudi.com/"
        headers = Headers({
            'User-Agent': 'Mozilla/5.0',
            'Content-Type': 'application/json'
        })

        yield Request(url, self.parse, headers=headers)
Code example #22
File: utils.py Project: wusir2001/galaxy
 def get_status_headers(cls, uid):
     return Headers({
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'en-US,en;q=0.8,zh;q=0.6',
         'Accept': 'application/json, text/plain, */*',
         'Host': 'm.weibo.cn',
         'Referer': cls.get_m_weibo_home_url(uid),
         'X-Requested-With': 'XMLHttpRequest',
         'Connection': 'keep-alive',
     })
Code example #23
File: mongodb.py Project: yashodhank/invana-bot
 def retrieve_response(self, spider, request):
     data = self._read_data(spider, request)
     if data is None:
         return  # not cached
     url = data['url']
     status = data['status']
     headers = Headers(data['headers'])
     body = data['html']
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Code example #24
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # NOTE: 'headers' is assumed to be defined elsewhere in this middleware
        # (e.g. built in __init__); it is not shown in this snippet.
        request.headers = Headers(headers)
        return None
Code example #25
 def retrieve_response(self, spider: TSpider,
                       request: TRequest) -> Optional[TResponse]:
     data = self._read_data(spider, request)
     if data is None:
         return  # not cached
     url = data["url"]
     status = data["status"]
     headers = Headers(data["headers"])
     body = data["body"]
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Code example #26
File: utils.py Project: wusir2001/galaxy
 def get_single_status_headers(cls, uid):
     """
     Headers for a single weibo status.
     :param uid:
     :return:
     """
     return Headers({
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'en-US,en;q=0.8,zh;q=0.6',
         'Host': 'm.weibo.cn',
         'Referer': cls.get_m_weibo_home_url(uid),
     })
Code example #27
 def start_requests(self):
     for word, url in self.start_urls:
         print("INDEX URL : " + url)
         headers = Headers({'Content-Type': 'application/json'})
         # request = scrapy.Request(url, self.parse, headers=headers, meta={
         #     'splash': {
         #         'endpoint': 'render.html',
         #         'args': {'wait': 1.0}
         #     }
         # })
         request = scrapy.Request(url, self.parse, headers=headers)
         request.meta['word'] = word
         yield request
Code example #28
    async def _download_request_with_page(self, request: Request,
                                          spider: Spider,
                                          page: Page) -> Response:
        start_time = time()
        response = await page.goto(request.url)

        page_coroutines = request.meta.get("playwright_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if isinstance(pc, PageCoroutine):
                method = getattr(page, pc.method)
                pc.result = await method(*pc.args, **pc.kwargs)
                await page.wait_for_load_state(
                    timeout=self.default_navigation_timeout)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        if request.meta.get("playwright_include_page"):
            request.meta["playwright_page"] = page
        else:
            await page.close()
            self.stats.inc_value("playwright/page_count/closed")

        headers = Headers(response.headers)
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers,
                                          url=page.url,
                                          body=body)
        return respcls(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["playwright"],
        )
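A minimal spider-side sketch for the handler above (assumed wiring as in scrapy-playwright; the "playwright" flag that routes a request through the handler is not shown in this snippet): setting "playwright_include_page" keeps the page open and exposes it to the callback through response.meta.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.org",
            meta={"playwright": True, "playwright_include_page": True},
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        await page.close()  # the spider must close the page it asked to keep
        yield {"title": title}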
Code example #29
 def process_request(self, request, spider):
     headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
         'Host': 'weibo.cn',
         'Cookie': 'xxx',
         "Accept-Language":
         "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
         "Accept-Encoding": "gzip, deflate, br",
         "Connection": "keep-alive"
     }
     request.headers = Headers(headers)
Code example #30
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 body=None,
                 cookies=None,
                 meta=None,
                 encoding='utf-8',
                 priority=0,
                 dont_filter=False,
                 errback=None,
                 flags=None):

        ## encoding
        self._encoding = encoding  # this one has to be set first
        ## request method
        self.method = str(method).upper()
        ## set the URL
        self._set_url(url)
        ## set the body
        self._set_body(body)
        assert isinstance(
            priority, int), "Request priority not an integer: %r" % priority
        ## priority
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' %
                            type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' %
                            type(errback).__name__)
        assert callback or not errback, "Cannot use errback without a callback"
        ## callback function
        self.callback = callback
        ## errback (error callback) function
        self.errback = errback

        ## cookies
        self.cookies = cookies or {}
        ## build the request headers
        self.headers = Headers(headers or {}, encoding=encoding)
        ## whether to skip duplicate filtering
        self.dont_filter = dont_filter

        ## extra meta information
        self._meta = dict(meta) if meta else None
        self.flags = [] if flags is None else list(flags)
Code example #31
    def process_request(self, request, spider):
        if 'splash' not in request.meta:
            return

        if request.method not in {'GET', 'POST'}:
            logger.warn(
                "Currently only GET and POST requests are supported by "
                "SplashMiddleware; %(request)s will be handled without Splash",
                {'request': request},
                extra={'spider': spider}
            )
            return request

        if request.meta.get("_splash_processed"):
            # don't process the same request more than once
            return

        splash_options = request.meta['splash']
        request.meta['_splash_processed'] = True

        slot_policy = splash_options.get('slot_policy', self.slot_policy)
        self._set_download_slot(request, request.meta, slot_policy)

        args = splash_options.setdefault('args', {})

        if '_replaced_args' in splash_options:
            # restore arguments before sending request to the downloader
            load_args = {}
            save_args = []
            local_arg_fingerprints = {}
            for name in splash_options['_replaced_args']:
                fp = args[name]
                # Use remote Splash argument cache: if Splash key
                # for a value is known then don't send the value to Splash;
                # if it is unknown then try to save the value on server using
                # ``save_args``.
                if fp in self._remote_keys:
                    load_args[name] = self._remote_keys[fp]
                    del args[name]
                else:
                    save_args.append(name)
                    args[name] = self._argument_values[fp]

                local_arg_fingerprints[name] = fp

            if load_args:
                args['load_args'] = load_args
            if save_args:
                args['save_args'] = save_args
            splash_options['_local_arg_fingerprints'] = local_arg_fingerprints

            del splash_options['_replaced_args']  # ??

        args.setdefault('url', request.url)
        if request.method == 'POST':
            args.setdefault('http_method', request.method)
            # XXX: non-UTF8 bodies are not supported now
            args.setdefault('body', request.body.decode('utf8'))

        if not splash_options.get('dont_send_headers'):
            headers = scrapy_headers_to_unicode_dict(request.headers)
            if headers:
                args.setdefault('headers', headers)

        body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4)
        # print(body)

        if 'timeout' in args:
            # User requested a Splash timeout explicitly.
            #
            # We can't catch a case when user requested `download_timeout`
            # explicitly because a default value for `download_timeout`
            # is set by DownloadTimeoutMiddleware.
            #
            # As user requested Splash timeout explicitly, we shouldn't change
            # it. Another reason not to change the requested Splash timeout is
            # because it may cause a validation error on the remote end.
            #
            # But we can change Scrapy `download_timeout`: increase
            # it when it's too small. Decreasing `download_timeout` is not
            # safe.

            timeout_requested = float(args['timeout'])
            timeout_expected = timeout_requested + self.splash_extra_timeout

            # no timeout means infinite timeout
            timeout_current = request.meta.get('download_timeout', 1e6)

            if timeout_expected > timeout_current:
                request.meta['download_timeout'] = timeout_expected

        endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
        splash_base_url = splash_options.get('splash_url', self.splash_base_url)
        splash_url = urljoin(splash_base_url, endpoint)

        headers = Headers({'Content-Type': 'application/json'})
        headers.update(splash_options.get('splash_headers', {}))
        new_request = request.replace(
            url=splash_url,
            method='POST',
            body=body,
            headers=headers,
            priority=request.priority + self.rescheduling_priority_adjust
        )
        self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
        return new_request
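Every snippet above builds a scrapy.http.Headers object; a quick sanity-check sketch of the behaviour they rely on (my own illustration, not taken from any of the examples): keys are case-insensitive and values are stored internally as lists of bytes.

from scrapy.http.headers import Headers

headers = Headers({'Content-Type': 'application/json'})
assert headers['content-type'] == b'application/json'        # case-insensitive lookup
assert headers.getlist('Content-Type') == [b'application/json']

headers.setdefault('Content-Type', 'text/html')   # key already present: unchanged
assert headers['Content-Type'] == b'application/json'

headers.appendlist('Accept', 'text/html')         # a header can hold multiple values
assert headers.getlist('Accept') == [b'text/html']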