Esempio n. 1
0
 def test_update(self):
     # update() is given one scalar value and one list value; getlist()
     # must hand each of them back in list form.
     headers = Headers()
     incoming = {
         'Content-Type': 'text/html',
         'X-Forwarded-For': ['ip1', 'ip2'],
     }
     headers.update(incoming)

     content_type = headers.getlist('Content-Type')
     self.assertEqual(content_type, ['text/html'])

     forwarded_for = headers.getlist('X-Forwarded-For')
     self.assertEqual(forwarded_for, ['ip1', 'ip2'])
Esempio n. 2
0
 def test_update(self):
     # Populate the headers through update() with a plain string and a
     # list of strings.
     headers = Headers()
     new_entries = {
         "Content-Type": "text/html",
         "X-Forwarded-For": ["ip1", "ip2"],
     }
     headers.update(new_entries)

     # The values are expected back from getlist() as bytes.
     stored_content_type = headers.getlist("Content-Type")
     stored_forwarded_for = headers.getlist("X-Forwarded-For")
     self.assertEqual(stored_content_type, [b"text/html"])
     self.assertEqual(stored_forwarded_for, [b"ip1", b"ip2"])
Esempio n. 3
0
 def _headers_from_twisted_response(response):
     """Build a ``Headers`` object from ``response``.

     A ``Content-Length`` entry is derived from ``response.length`` unless
     that length equals ``UNKNOWN_LENGTH``; the raw headers carried by the
     response are then merged on top of it.
     """
     result = Headers()
     length_is_known = response.length != UNKNOWN_LENGTH
     if length_is_known:
         # Stored as the encoded decimal representation of the length.
         result[b'Content-Length'] = str(response.length).encode()
     raw_headers = response.headers.getAllRawHeaders()
     # Merged last, so raw header values take precedence on a key clash
     # (assuming update() overwrites existing keys -- TODO confirm).
     result.update(raw_headers)
     return result
Esempio n. 4
0
 def test_update(self):
     # Feed update() one single-valued and one multi-valued header.
     headers = Headers()
     payload = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}
     headers.update(payload)

     # Check each header name against the list getlist() should return.
     expectations = (
         ('Content-Type', ['text/html']),
         ('X-Forwarded-For', ['ip1', 'ip2']),
     )
     for header_name, expected_values in expectations:
         self.assertEqual(headers.getlist(header_name), expected_values)
Esempio n. 5
0
    def process_request(self, request, spider):
        """
        Turn an eligible GET request into a POST request aimed at the
        AutoExtract API and return that new request.

        A request is only forwarded when it is explicitly enabled with
        `{'autoextract': {'enabled': True}}` meta. The page type must be
        available as well, either through the AUTOEXTRACT_PAGE_TYPE option
        or through `{'autoextract': {'pageType': '...'}}` meta.

        Returns None (leaving the request untouched) when the request is not
        enabled or already carries AutoExtract metadata. Raises
        AutoExtractError for any method other than GET.
        """
        # Skip requests that are not enabled, and requests that were already
        # processed by AutoExtract (their meta carries the marker key).
        if (not self._is_enabled_for_request(request)
                or request.meta.get(AUTOEXTRACT_META_KEY)):
            return

        if request.method != 'GET':
            raise AutoExtractError('Only GET requests are supported by AutoExtract')

        # Remember where the request originally pointed and when processing
        # started.
        autoextract_meta = {'original_url': request.url}
        autoextract_meta['timing'] = {'start_ts': time.time()}
        request.meta[AUTOEXTRACT_META_KEY] = autoextract_meta

        # The page type may be overridden by the request itself
        page_type = self._check_page_type(request)
        logger.debug(
            'Process AutoExtract request for %s URL %s',
            page_type, request,
            extra={'spider': spider},
        )

        # Request timeout
        request.meta['download_timeout'] = self.timeout

        # Concurrency settings
        self._set_download_slot(request, request.meta)

        # Base payload, optionally extended with the extra payload
        api_payload = {'url': request.url, 'pageType': page_type}
        payload_overrides = self._get_meta_name(request, 'extra')
        if payload_overrides:
            api_payload.update(payload_overrides)

        # Default headers, optionally extended with the provided headers
        api_headers = Headers({
            'Content-Type': 'application/json',
            'User-Agent': USER_AGENT,
            'Authorization': basic_auth_header(self._api_user, self._api_pass),
        })
        header_overrides = self._get_meta_name(request, 'headers')
        if header_overrides:
            api_headers.update(header_overrides)

        # The API receives a JSON list holding the single payload object.
        api_request = request.replace(
            url=self._api_url,
            method='POST',
            headers=api_headers,
            body=json.dumps([api_payload], sort_keys=True),
        )

        self.inc_metric('autoextract/request_count')
        return api_request