Example #1
    def get_netscape_cookie_spec(self, cookie, request_host):
        # FIXME: cookie.domain can no longer be None here,
        # so the request_host fallback is not needed anymore
        host = make_unicode(cookie.domain) or request_host
        if cookie.get_nonstandard_attr('HttpOnly'):
            host = '#HttpOnly_' + host
        items = [
            host,
            u'TRUE',
            make_unicode(cookie.path),
            u'TRUE' if cookie.secure else u'FALSE',
            make_unicode(
                str(cookie.expires if cookie.expires else YEAR_2030_EPOCH_TIME)
            ),
            make_unicode(cookie.name),
            make_unicode(cookie.value),
        ]
        return (u'\t'.join(items)).encode('utf-8')
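For context, the method above emits one line of the Netscape cookies.txt format: tab-separated fields in the order domain, include-subdomains flag, path, secure flag, expiry epoch, name, value. A standalone illustration with made-up values (not part of the library):

fields = [
    '.example.com',   # domain
    'TRUE',           # include-subdomains flag
    '/',              # path
    'FALSE',          # secure flag
    '1893456000',     # expiry as epoch seconds (2030-01-01 UTC)
    'sessionid',      # cookie name
    'abc123',         # cookie value
]
print('\t'.join(fields))
# prints: .example.com  TRUE  /  FALSE  1893456000  sessionid  abc123
# (fields separated by real tab characters)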
Example #2
    def execute_task_handler(self, res, handler):
        """
        Apply `handler` function to the network result.

        If network result is failed then submit task again
        to the network task queue.
        """

        try:
            handler_name = handler.__name__
        except AttributeError:
            handler_name = 'NONE'

        if (res['task'].get('raw') or (
            res['ok'] and self.valid_response_code(res['grab'].response.code,
                                                   res['task']))):
            try:
                with self.stat.log_time('response_handler'):
                    with self.stat.log_time('response_handler.%s' % handler_name):
                        result = handler(res['grab'], res['task'])
                        if result is not None:
                            for item in result:
                                self.process_handler_result(item, res['task'])
            except NoDataHandler:
                raise
            except Exception as ex:
                self.process_handler_error(handler_name, ex, res['task'])
            else:
                self.stat.inc('spider:task-%s-ok' % res['task'].name)
        else:
            # Log the error
            if res['ok']:
                msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
            else:
                msg = res['emsg']

            self.stat.inc('spider:network-error-%s' %
                          make_str(res['emsg'][:20], errors='ignore'))
            logger.error(u'Network error: %s' %
                         make_unicode(msg, errors='ignore'))

            # Try to repeat the same network query
            if self.network_try_limit > 0:
                task = res['task']
                task.refresh_cache = True
                # Should use task.grab_config or backup of grab_config
                task.setup_grab_config(res['grab_config_backup'])
                self.add_task(task)
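The nested self.stat.log_time(...) calls above suggest a context-manager-based timer. A minimal sketch of how such a timer could work (the real Stat class in the library may differ; this is an assumption for illustration):

import time
from contextlib import contextmanager

class Stat:
    def __init__(self):
        self.timers = {}

    @contextmanager
    def log_time(self, key):
        # Accumulate wall-clock time spent inside the with-block under `key`
        start = time.time()
        try:
            yield
        finally:
            self.timers[key] = self.timers.get(key, 0) + (time.time() - start)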
Example #3
    def execute_task_handler(self, res, handler):
        """
        Apply `handler` function to the network result.

        If network result is failed then submit task again
        to the network task queue.
        """

        try:
            handler_name = handler.__name__
        except AttributeError:
            handler_name = 'NONE'

        if (res['task'].get('raw') or (res['ok'] and self.valid_response_code(
                res['grab'].response.code, res['task']))):
            try:
                with self.save_timer('response_handler'):
                    with self.save_timer('response_handler.%s' % handler_name):
                        result = handler(res['grab'], res['task'])
                        if result is not None:
                            for item in result:
                                self.process_handler_result(item, res['task'])
            except NoDataHandler:
                raise
            except Exception as ex:
                self.process_handler_error(handler_name, ex, res['task'])
            else:
                self.inc_count('task-%s-ok' % res['task'].name)
        else:
            # Log the error
            if res['ok']:
                msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
            else:
                msg = res['emsg']

            self.inc_count('network-error-%s' %
                           make_str(res['emsg'][:20], errors='ignore'))
            logger.error(u'Network error: %s' %
                         make_unicode(msg, errors='ignore'))

            # Try to repeat the same network query
            if self.network_try_limit > 0:
                task = res['task']
                task.refresh_cache = True
                # Should use task.grab_config or backup of grab_config
                task.setup_grab_config(res['grab_config_backup'])
                self.add_task(task)
Example #4
def normalize_url(url):
    # https://tools.ietf.org/html/rfc3986
    url = make_unicode(url)
    if RE_NOT_SAFE_CHAR.search(url):
        parts = list(urlsplit(url))
        # Scheme: no escaping required
        # Network location (user:pass@hostname)
        if RE_NON_ALPHA_DIGIT_NETLOC.search(parts[1]):
            parts[1] = parts[1].encode('idna')
        # Path
        parts[2] = quote(make_str(parts[2]), safe=RESERVED_CHARS)
        # Query
        parts[3] = quote(make_str(parts[3]), safe=RESERVED_CHARS)
        # Fragment
        parts[4] = quote(make_str(parts[4]), safe=RESERVED_CHARS)
        return urlunsplit(map(make_unicode, parts))
    return url
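A standalone sketch of the idea behind normalize_url: IDNA-encode the host and percent-encode the path, query, and fragment per RFC 3986. The regex and the RESERVED_CHARS set below are assumptions for illustration, not the library's actual definitions (Python 3 syntax):

import re
from urllib.parse import urlsplit, urlunsplit, quote

RESERVED_CHARS = ";/?:@&=+$,"  # assumed: the RFC 3986 reserved set
RE_NOT_SAFE_CHAR = re.compile(r'[^-.a-zA-Z0-9;/?:@&=+$,%#~_]')

def normalize_url_demo(url):
    if RE_NOT_SAFE_CHAR.search(url):
        parts = list(urlsplit(url))
        # Network location: punycode non-ASCII hostnames
        parts[1] = parts[1].encode('idna').decode('ascii')
        # Path, query and fragment: percent-encode unsafe characters
        for idx in (2, 3, 4):
            parts[idx] = quote(parts[idx], safe=RESERVED_CHARS)
        return urlunsplit(parts)
    return url

print(normalize_url_demo(u'https://пример.рф/путь'))
# https://xn--e1afmkfd.xn--p1ai/%D0%BF%D1%83%D1%82%D1%8C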
Example #5
    def find_link(self, href_pattern, make_absolute=True):
        """
        Find link in response body which href value matches ``href_pattern``.

        Returns found url or None.
        """

        if make_absolute:
            self.tree.make_links_absolute(self.response.url)

        if isinstance(href_pattern, six.text_type):
            raise GrabMisuseError('Method `find_link` accepts only '
                                  'byte-string argument')
        href_pattern = make_unicode(href_pattern)
        for elem, attr, link, pos in self.tree.iterlinks():
            if elem.tag == 'a' and href_pattern in link:
                return link
        return None
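The loop above relies on lxml's iterlinks() protocol, which yields (element, attribute, link, pos) tuples. A standalone sketch of that mechanic (lxml assumed installed; this is not the library's API):

from lxml import html

tree = html.fromstring('<a href="/download/file.zip">get</a>')
tree.make_links_absolute('http://example.com/')
for elem, attr, link, pos in tree.iterlinks():
    if elem.tag == 'a' and u'download' in link:
        print(link)  # http://example.com/download/file.zip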
Example #6
    def prepare_response(self, grab):
        if self.body_file:
            self.body_file.close()
        response = Response()

        # py3 hack
        if six.PY3:
            bytes_head = b''.join(self.response_head_chunks)
        else:
            bytes_head = ''.join(self.response_head_chunks)
        response.head = make_unicode(bytes_head, errors='ignore')

        if self.body_path:
            response.body_path = self.body_path
        else:
            response.body = b''.join(self.response_body_chunks)

        # Clear memory
        self.response_head_chunks = []
        self.response_body_chunks = []

        response.code = self.curl.getinfo(pycurl.HTTP_CODE)
        response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = self.curl.getinfo(pycurl.EFFECTIVE_URL)

        response.parse(charset=grab.config['document_charset'])

        response.cookies = CookieManager(self.extract_cookiejar())

        # We no longer need the cookies stored in the curl
        # instance, so drop them
        self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
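A side note on the last setopt call: COOKIELIST accepts several special libcurl commands besides cookie lines. A standalone illustration (pycurl assumed installed):

import pycurl

curl = pycurl.Curl()
# 'ALL' erases every cookie held by the handle, which is what
# prepare_response() relies on above. Other special commands libcurl
# accepts here: 'SESS' (clear session cookies), 'FLUSH' (write cookies
# to COOKIEJAR) and 'RELOAD' (reload cookies from COOKIEFILE).
curl.setopt(pycurl.COOKIELIST, 'ALL')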
Example #7
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            if req.proxy_type == 'socks5':
                pool = SOCKSProxyManager(
                    proxy_url,
                    cert_reqs='CERT_REQUIRED',
                    ca_certs=certifi.where())  # , proxy_headers=headers)
            else:
                pool = ProxyManager(proxy_url,
                                    proxy_headers=headers,
                                    cert_reqs='CERT_REQUIRED',
                                    ca_certs=certifi.where())
        else:
            pool = self.pool
        with self.wrap_transport_error():
            # Retries could be disabled by passing retries=False:
            # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
            # We pass an explicit Retry object instead, because plain False
            # makes urllib3 log a warning:
            # "Converted retries value: False -> Retry(total=False,
            # connect=None, read=None, redirect=0, status=None)"
            retry = Retry(
                total=False,
                connect=False,
                read=False,
                redirect=0,
                status=None,
            )
            # The read timeout is not a total-response-time timeout;
            # it is the timeout for reading the next chunk of data
            # from the server. Total response time is handled by Grab.
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            try:
                res = pool.urlopen(req_method,
                                   req_url,
                                   body=req.data,
                                   timeout=timeout,
                                   retries=retry,
                                   headers=req.headers,
                                   preload_content=False)
            except UnicodeError as ex:
                raise error.GrabConnectionError('GrabInvalidUrl', ex)
        #except exceptions.ReadTimeoutError as ex:
        #    raise error.GrabTimeoutError('ReadTimeoutError', ex)
        #except exceptions.ConnectTimeoutError as ex:
        #    raise error.GrabConnectionError('ConnectTimeoutError', ex)
        #except exceptions.ProtocolError as ex:
        #    # TODO:
        #    # the code
        #    # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        #    # fails
        #    # with error TypeError: 'OSError' object is not subscriptable
        #    raise error.GrabConnectionError('ProtocolError', ex)
        #except exceptions.SSLError as ex:
        #    raise error.GrabConnectionError('SSLError', ex)

        # WTF?
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res
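The retry and timeout settings above can be reproduced standalone. A minimal urllib3 sketch of the same configuration (urllib3 and a reachable URL assumed; the values are placeholders, not the library's defaults):

import urllib3
from urllib3.util.retry import Retry
from urllib3.util.timeout import Timeout

pool = urllib3.PoolManager()
# One attempt, no redirect handling by urllib3 -- mirrors the Retry
# object built in request() above
retry = Retry(total=False, connect=False, read=False, redirect=0, status=None)
# Connect timeout plus per-read-chunk timeout; total time is the caller's job
timeout = Timeout(connect=2.0, read=10.0)
res = pool.urlopen('GET', 'http://example.com/', retries=retry,
                   timeout=timeout, preload_content=False)
print(res.status)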
Example #8
    def process_config(self, grab):
        req = Request(data=None)

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' %
                (six.text_type(ex),
                 make_unicode(grab.config['url'], errors='ignore')))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        req.config_body_maxsize = grab.config['body_maxsize']
        req.config_nobody = grab.config['nobody']

        req.timeout = grab.config['timeout']
        req.connect_timeout = grab.config['connect_timeout']

        extra_headers = {}

        # Body processing
        if not grab.config['body_inmemory']:
            if not grab.config['body_storage_dir']:
                raise GrabMisuseError('Option body_storage_dir is not defined')
            file_, path_ = self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            req.response_file = file_
            req.response_path = path_

        if grab.config['multipart_post'] is not None:
            post_data = grab.config['multipart_post']
            if isinstance(post_data, six.binary_type):
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError('Option multipart_post'
                                      ' does not accept unicode data')
            else:
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                post_items = decode_pairs(post_items, grab.config['charset'])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers['Content-Type'] = content_type
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data
        elif grab.config['post'] is not None:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data

        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` nor `multipart_post`'
                                      ' option was specified for the %s'
                                      ' request' % method)
        # Proxy
        if grab.config['proxy']:
            req.proxy = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']

        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            req.proxy_type = 'http'

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        extra_headers['User-Agent'] = grab.config['user_agent']

        # Headers
        headers = extra_headers
        headers.update(grab.config['common_headers'])

        if grab.config['headers']:
            headers.update(grab.config['headers'])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)

        self._request = req
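The multipart branch above pairs the encoded body with matching Content-Type and Content-Length headers. urllib3 ships a helper with the same name that shows the shape of that contract (the library uses its own variant, so treat this as an assumption):

from urllib3.filepost import encode_multipart_formdata

body, content_type = encode_multipart_formdata({'field': 'value'})
headers = {
    'Content-Type': content_type,          # multipart/form-data; boundary=...
    'Content-Length': str(len(body)),
}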
Example #9
    def test_make_unicode_from_none(self):
        self.assertEqual(
            make_unicode(None),
            u'None',
        )
Example #10
    def test_make_unicode_from_int(self):
        self.assertEqual(
            make_unicode(1),
            u'1',
        )
Example #11
    def test_make_unicode_from_unicode(self):
        self.assertEqual(
            make_unicode(u'фыва'),
            u'фыва',
        )
Example #12
    def test_make_unicode_from_str(self):
        self.assertEqual(
            make_unicode(u'фыва'.encode('utf-8')),
            u'фыва',
        )
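Taken together, the four tests pin down make_unicode's contract: bytes are decoded, text passes through unchanged, and everything else is stringified. A minimal sketch consistent with those tests (the real implementation lives in the library's tools module and may differ):

import six

def make_unicode(value, encoding='utf-8', errors='strict'):
    if isinstance(value, six.text_type):
        return value                            # already unicode
    if isinstance(value, six.binary_type):
        return value.decode(encoding, errors)   # decode byte strings
    return six.text_type(value)                 # None -> u'None', 1 -> u'1'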
Example #13
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            if req.proxy_type == 'socks5':
                pool = SOCKSProxyManager(
                    proxy_url,
                    cert_reqs='CERT_REQUIRED',
                    ca_certs=certifi.where()
                ) # , proxy_headers=headers)
            else:
                pool = ProxyManager(
                    proxy_url, proxy_headers=headers,
                    cert_reqs='CERT_REQUIRED',
                    ca_certs=certifi.where()
                )
        else:
            pool = self.pool
        try:
            # Retries could be disabled by passing retries=False:
            # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
            # We pass an explicit Retry object instead, because plain False
            # makes urllib3 log a warning:
            # "Converted retries value: False -> Retry(total=False,
            # connect=None, read=None, redirect=0, status=None)"
            retry = Retry(
                total=False,
                connect=False,
                read=False,
                redirect=0,
                status=None,
            )
            # The read timeout is not a total-response-time timeout;
            # it is the timeout for reading the next chunk of data
            # from the server. Total response time is handled by Grab.
            timeout = Timeout(connect=req.connect_timeout,
                              read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            try:
                res = pool.urlopen(req_method,
                                   req_url,
                                   body=req.data, timeout=timeout,
                                   retries=retry,
                                   headers=req.headers,
                                   preload_content=False)
            except UnicodeError as ex:
                raise error.GrabConnectionError('GrabInvalidUrl', ex)
        except exceptions.ReadTimeoutError as ex:
            raise error.GrabTimeoutError('ReadTimeoutError', ex)
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('ConnectTimeoutError', ex)
        except exceptions.ProtocolError as ex:
            # TODO:
            # the code
            # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
            # fails
            # with error TypeError: 'OSError' object is not subscriptable
            raise error.GrabConnectionError('ProtocolError', ex)
        except exceptions.SSLError as ex:
            raise error.GrabConnectionError('SSLError', ex)

        # WTF?
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res