Example #1
    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply with the Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, there is
        # no need to add the callback _waitForDisconnect.
        # Specifically, this avoids the AttributeError exception when
        # the clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0
Example #2
 def get_defaults_spider_mw(self):
     crawler = get_crawler(Spider)
     spider = crawler._create_spider('foo')
     defaults = {
         to_bytes(k): [to_bytes(v)]
         for k, v in crawler.settings.get('DEFAULT_REQUEST_HEADERS').items()
     }
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Example #3
 def testHostHeader(self):
     # if we pass the Host header explicitly, it should be used; otherwise
     # it should be extracted from the url
     return defer.gatherResults([
         getPage(self.getURL("host")).addCallback(
             self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno)),
         getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback(
             self.assertEquals, to_bytes("www.example.com"))])
Example #4
 def renderRequest(self, request, nlist):
     s = """<html> <head></head> <body>"""
     args = request.args.copy()
     for nl in nlist:
         args[b"n"] = [to_bytes(str(nl))]
         argstr = urlencode(args, doseq=True)
         s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
     s += """</body>"""
     request.write(to_bytes(s))
     request.finish()
Example #5
def response_httprepr(response):
    """Return raw HTTP representation (as bytes) of the given response. This
    is provided only for reference, since it's not the exact stream of bytes
    that was received (that's not exposed by Twisted).
    """
    s = b"HTTP/1.1 " + to_bytes(str(response.status)) + b" " + \
        to_bytes(http.RESPONSES.get(response.status, b'')) + b"\r\n"
    if response.headers:
        s += response.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += response.body
    return s
Example #6
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
Example #7
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be send when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname) + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
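A minimal usage sketch for the function above (the URL and output are illustrative, assuming the implementation shown here): request_httprepr rebuilds an approximate raw HTTP/1.1 request from a Request object.

from scrapy import Request
from scrapy.utils.request import request_httprepr

req = Request('http://www.example.com/some/page.html?arg=1')
print(request_httprepr(req))
# roughly: b'GET /some/page.html?arg=1 HTTP/1.1\r\nHost: www.example.com\r\n\r\n'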
Example #8
def multpart_encode(form_data):
    _boundary = to_bytes(''.join(
        random.choice(string.digits + string.ascii_letters) for i in range(20)))
    content_type = "multipart/form-data; boundary=" + _boundary

    body = []
    for name, value in form_data.items():
        body.append(b'--' + _boundary)
        body.append(b'Content-Disposition: form-data; name="' + to_bytes(name) + b'"')
        body.append(b'')
        body.append(to_bytes(value))

    body.append(b'--' + _boundary + b'--')

    return content_type, b'\r\n'.join(body)
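A brief usage sketch for the helper above (field names are made up for illustration): it returns a content-type string and the encoded multipart body, which can then be attached to an outgoing POST request.

content_type, body = multpart_encode({'field1': 'value1', 'field2': 'value2'})
# e.g. attach both to a scrapy Request:
#   Request(url, method='POST', body=body,
#           headers={'Content-Type': content_type})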
Example #9
 def _build_response(self, result, request, protocol):
     self.result = result
     respcls = responsetypes.from_args(url=request.url)
     protocol.close()
     body = protocol.filename or protocol.body.read()
     headers = {"local filename": protocol.filename or '', "size": protocol.size}
     return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
Example #10
 def thumb_path(self, request, thumb_id, response=None, info=None):
     url = request.url
     image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
     if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
         return '{ig}.jpg'.format(ig=image_guid)
     else:
         return 'thumbs/{p}/{ig}.jpg'.format(p=thumb_id, ig=image_guid)
Example #11
    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings

            warnings.warn(
                "ImagesPipeline.thumb_key(url) method is deprecated, please use "
                "thumb_path(request, thumb_id, response=None, info=None) instead",
                category=ScrapyDeprecationWarning,
                stacklevel=1,
            )

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, "_base"):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return "thumbs/%s/%s.jpg" % (thumb_id, thumb_guid)
Example #12
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_bytes(url, encoding))
Example #13
 def requestTunnel(self, protocol):
     """Asks the proxy to open a tunnel."""
     tunnelReq = (
         b'CONNECT ' +
         to_bytes(self._tunneledHost, encoding='ascii') + b':' +
         to_bytes(str(self._tunneledPort)) +
         b' HTTP/1.1\r\n')
     if self._proxyAuthHeader:
         tunnelReq += \
             b'Proxy-Authorization: ' + self._proxyAuthHeader + b'\r\n'
     tunnelReq += b'\r\n'
     protocol.transport.write(tunnelReq)
     self._protocolDataReceived = protocol.dataReceived
     protocol.dataReceived = self.processProxyResponse
     self._protocol = protocol
     return protocol
Example #14
 def _test_request(self, **kwargs):
     r = self.request_class('http://scrapytest.org/rpc2', **kwargs)
     self.assertEqual(r.headers[b'Content-Type'], b'text/xml')
     self.assertEqual(r.body, to_bytes(xmlrpclib.dumps(**kwargs)))
     self.assertEqual(r.method, 'POST')
     self.assertEqual(r.encoding, kwargs.get('encoding', 'utf-8'))
     self.assertTrue(r.dont_filter, True)
Example #15
    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        if hasattr(item, '_context'):
            itemdict['@context'] = item._context

        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Example #16
    def testParse(self):
        lip = '127.0.0.1'
        tests = (
    ("http://127.0.0.1?c=v&c2=v2#fragment",     ('http', lip, lip, 80, '/?c=v&c2=v2')),
    ("http://127.0.0.1/?c=v&c2=v2#fragment",    ('http', lip, lip, 80, '/?c=v&c2=v2')),
    ("http://127.0.0.1/foo?c=v&c2=v2#frag",     ('http', lip, lip, 80, '/foo?c=v&c2=v2')),
    ("http://127.0.0.1:100?c=v&c2=v2#fragment", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
    ("http://127.0.0.1:100/?c=v&c2=v2#frag",    ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
    ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/foo?c=v&c2=v2')),

    ("http://127.0.0.1",              ('http', lip, lip, 80, '/')),
    ("http://127.0.0.1/",             ('http', lip, lip, 80, '/')),
    ("http://127.0.0.1/foo",          ('http', lip, lip, 80, '/foo')),
    ("http://127.0.0.1?param=value",  ('http', lip, lip, 80, '/?param=value')),
    ("http://127.0.0.1/?param=value", ('http', lip, lip, 80, '/?param=value')),
    ("http://127.0.0.1:12345/foo",    ('http', lip+':12345', lip, 12345, '/foo')),
    ("http://spam:12345/foo",         ('http', 'spam:12345', 'spam', 12345, '/foo')),
    ("http://spam.test.org/foo",      ('http', 'spam.test.org', 'spam.test.org', 80, '/foo')),

    ("https://127.0.0.1/foo",         ('https', lip, lip, 443, '/foo')),
    ("https://127.0.0.1/?param=value", ('https', lip, lip, 443, '/?param=value')),
    ("https://127.0.0.1:12345/",      ('https', lip+':12345', lip, 12345, '/')),

    ("http://scrapytest.org/foo ",    ('http', 'scrapytest.org', 'scrapytest.org', 80, '/foo')),
    ("http://egg:7890 ",              ('http', 'egg:7890', 'egg', 7890, '/')),
    )

        for url, test in tests:
            test = tuple(
                to_bytes(x) if not isinstance(x, int) else x for x in test)
            self.assertEquals(client._parse(url), test, url)
Example #17
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)
Example #18
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        return 'full/%s%s' % (media_guid, media_ext)
Example #19
 def export_item(self, item):
     if self.first_item:
         self.first_item = False
     else:
         self.file.write(b",\n")
     itemdict = dict(self._get_serialized_fields(item))
     self.file.write(to_bytes(self.encoder.encode(itemdict)))
Example #20
 def to_native_str(text, encoding=None, errors='strict'):
     """ Return str representation of `text`
     (bytes in Python 2.x and unicode in Python 3.x). """
     if six.PY2:
         return to_bytes(text, encoding, errors)
     else:
         return to_unicode(text, encoding, errors)
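A quick illustration of the helper above (values chosen for illustration, importing the same helper from scrapy.utils.python, where it historically lived): on Python 3 it decodes bytes to text using UTF-8 by default and returns text unchanged.

from scrapy.utils.python import to_native_str

to_native_str(b'caf\xc3\xa9')  # -> 'café' on Python 3
to_native_str('café')          # -> 'café' (already text)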
Example #21
    def test_3xx_and_invalid_gzipped_body_must_redirect(self):
        """Regression test for a failure when redirecting a compressed
        request.

        This happens when the httpcompression middleware is executed before the
        redirect middleware and attempts to decompress a non-compressed body.
        In particular, it occurs when a website returns a 30x response with the
        header 'Content-Encoding: gzip', resulting in the error below:

            exceptions.IOError: Not a gzipped file

        """
        req = Request('http://example.com')
        body = b'<p>You are being redirected</p>'
        resp = Response(req.url, status=302, body=body, headers={
            'Content-Length': str(len(body)),
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
            'Location': 'http://example.com/login',
        })
        ret = self._download(request=req, response=resp)
        self.assertTrue(isinstance(ret, Request),
                        "Not redirected: {0!r}".format(ret))
        self.assertEqual(to_bytes(ret.url), resp.headers['Location'],
                         "Not redirected to location header")
Example #22
 def test_get_key(self):
     mw = self.mwcls(self.temp_dir, reset=True)
     test_req1 = Request('http://url1')
     self.assertEqual(mw._get_key(test_req1),
                      to_bytes(request_fingerprint(test_req1)))
     test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
     self.assertEqual(mw._get_key(test_req2), b'dfkey1')
Example #23
 def writeLine(self, data):
     """
     Write the data to the process stdin, adding the new-line delimiter if necessary
     """
     data = to_bytes(data)
     if not data.endswith(b'\n'):
         data += self.__delimiter
     self.transport.write(data)
Example #24
 def render_GET(self, request):
     output = {
         'headers': dict(
             (to_unicode(k), [to_unicode(v) for v in vs])
             for k, vs in request.requestHeaders.getAllRawHeaders()),
         'body': to_unicode(request.content.read()),
     }
     return to_bytes(json.dumps(output))
Example #25
 def export_item(self, item):
     if self.first_item:
         self.first_item = False
     else:
         self.file.write(b',\n')
     itemdict = dict(self._get_serialized_fields(item))
     data = self.encoder.encode(itemdict)
     self.file.write(to_bytes(data, self.encoding))
Example #26
 def browser_open(burl):
     path = urlparse(burl).path
     if not os.path.exists(path):
         path = burl.replace('file://', '')
     with open(path, "rb") as f:
         bbody = f.read()
     self.assertIn(b'<base href="' + to_bytes(url) + b'">', bbody)
     return True
Example #27
 def file_path(self, request, response=None, info=None):
     url = request.url
     image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
     if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
         return '{ig}.jpg'.format(ig=image_guid)
     elif self.conf["IMAGES_STORE_FORMAT"] == 'THUMBS':
         return 'thumbs/{p}/{ig}.jpg'.format(p=next(iter(list(settings.get('IMAGES_THUMBS').keys()))), ig=image_guid)
     else:
         return 'full/{ig}.jpg'.format(ig=image_guid)
Example #28
 def _failed(self, result, request):
     message = result.getErrorMessage()
     if result.type == CommandFailed:
         m = _CODE_RE.search(message)
         if m:
             ftpcode = m.group()
             httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
             return Response(url=request.url, status=httpcode, body=to_bytes(message))
     raise result.type(result.value)
Example #29
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers, use the
    include_headers argument, which is a list of Request headers to include.

    """
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                 for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
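A minimal usage sketch of the function above, illustrating the docstring: the two requests below differ only in query-parameter order, so canonicalize_url normalizes them to the same URL and the fingerprints match.

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.example.com/query?id=111&cat=222')
r2 = Request('http://www.example.com/query?cat=222&id=111')
assert request_fingerprint(r1) == request_fingerprint(r2)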
Example #30
 def test_timeoutNotTriggering(self):
     """
     When a non-zero timeout is passed to L{getPage} and the page is
     retrieved before the timeout period elapses, the L{Deferred} is
     called back with the contents of the page.
     """
     d = getPage(self.getURL("host"), timeout=100)
     d.addCallback(
         self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno))
     return d
Example #31
def getPage(url,
            contextFactory=None,
            response_transform=None,
            *args,
            **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    def _clientfactory(url, *args, **kwargs):
        url = to_unicode(url)
        timeout = kwargs.pop('timeout', 0)
        f = client.ScrapyHTTPClientFactory(Request(url, *args, **kwargs),
                                           timeout=timeout)
        f.deferred.addCallback(response_transform or (lambda r: r.body))
        return f

    from twisted.web.client import _makeGetterFactory
    return _makeGetterFactory(to_bytes(url),
                              _clientfactory,
                              contextFactory=contextFactory,
                              *args,
                              **kwargs).deferred
Example #32
    def _get_agent(self, request, timeout):
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if scheme == b'https' and not omitConnectTunnel:
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(reactor, proxyConf,
                    contextFactory=self._contextFactory, connectTimeout=timeout,
                    bindAddress=bindaddress, pool=self._pool)
            else:
                return self._ProxyAgent(reactor, proxyURI=to_bytes(proxy, encoding='ascii'),
                    connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        return self._Agent(reactor, contextFactory=self._contextFactory,
            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
Example #33
    def process_item(self, item, spider):
        session = self.session()
        processing = Processing(s3_raw=hashlib.sha1(to_bytes(
            item['doc_url'])).hexdigest()).save_to_db(session)

        policy = Policy(country=item['country'],
                        geo_code=item['geo_code'],
                        level=item['level'],
                        source=item['source'],
                        title=item['title'],
                        reference=item['reference'],
                        authorship=item['authorship'],
                        resume=item['resume'],
                        publication_date=item['publication_date'],
                        enforcement_date=item['enforcement_date'],
                        url=item['url'],
                        doc_url=item['doc_url'],
                        doc_name=item['doc_name'],
                        doc_type=item['doc_type'],
                        processing_id=processing.id).save_to_db(session)
Example #34
    def process_request(self, request, spider):
        cookies = spider.cookies
        driver = init_chromium(spider.selenium_hostname, cookies)

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        spider.wait_page_completion(driver=driver)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        request.meta['driver'] = driver
        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #35
    def file_path(self, request, response=None, info=None):
        """根据request返回存储文件的相对路径"""
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'FilesPipeline.file_key(url) method is deprecated, please use '
                'file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # SHA-1 hash
        media_ext = os.path.splitext(url)[1]
        return 'files/%s%s' % (media_guid, media_ext)
Example #36
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
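A short usage sketch (the response is constructed inline purely for illustration): open_in_browser writes the body to a temporary .html file, injecting a <base> tag, and opens it in the default browser.

from scrapy.http import HtmlResponse
from scrapy.utils.response import open_in_browser

response = HtmlResponse('http://www.example.com',
                        body=b'<html><head></head><body>hello</body></html>')
open_in_browser(response)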
Example #37
    def file_path(self, request, response=None, info=None, *, item=None):
        # function 2
        img_path = ""
        try:
            # HERE ARE CUSTOM CHANGES
            marketplace = request.meta.get("marketplace")
            asin = request.meta.get("asin")

            img_path = os.path.join(marketplace, asin + ".jpg")
        except:
            media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
            media_ext = os.path.splitext(request.url)[1]
            # Handles empty and wild extensions by trying to guess the
            # mime type then extension or default to empty string otherwise
            if media_ext not in mimetypes.types_map:
                media_ext = ''
                media_type = mimetypes.guess_type(request.url)[0]
                if media_type:
                    media_ext = mimetypes.guess_extension(media_type)
            img_path = f'full/{media_guid}{media_ext}'
        return img_path
Example #38
def run(output_path, download_workers):
    crawl(download_workers)
    reactor.run()  # the script will block here until the last crawl call is finished
    encoder = ScrapyJSONEncoder()
    with open("results/publications.jl", "rb+") as publications_info_file:
        with open(output_path, "wb+") as output_file:
            output_file.write(b"[")
            first = True
            while True:
                line = publications_info_file.readline()
                if not line:
                    break
                if first:
                    output_file.write(b"\n")
                    first = False
                else:
                    output_file.write(b",\n")
                company_info_data = json.loads(line)
                data = encoder.encode(company_info_data)
                output_file.write(to_bytes(data))
            output_file.write(b"\n]\n")
Example #39
    def _get_agent(self, request: Request,
                   timeout: Optional[float]) -> H2Agent:
        from twisted.internet import reactor
        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxy_host = proxy_host.decode()
            omit_connect_tunnel = b'noconnect' in proxy_params
            if omit_connect_tunnel:
                warnings.warn(
                    "Using HTTPS proxies in the noconnect mode is not "
                    "supported by the downloader handler. If you use Zyte "
                    "Smart Proxy Manager, it doesn't require this mode "
                    "anymore, so you should update scrapy-crawlera to "
                    "scrapy-zyte-smartproxy and remove '?noconnect' from the "
                    "Zyte Smart Proxy Manager URL.")

            if scheme == b'https' and not omit_connect_tunnel:
                # ToDo
                raise NotImplementedError(
                    'Tunneling via CONNECT method using HTTP/2.0 is not yet supported'
                )
            return self._ProxyAgent(
                reactor=reactor,
                context_factory=self._context_factory,
                proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
                connect_timeout=timeout,
                bind_address=bind_address,
                pool=self._pool,
            )

        return self._Agent(
            reactor=reactor,
            context_factory=self._context_factory,
            connect_timeout=timeout,
            bind_address=bind_address,
            pool=self._pool,
        )
Example #40
    def page_parse(self, response):
        # pagetitle=response.xpath('/html/head/title').extract_first()
        # print("detail parsing:", pagetitle)

        text1 = response.xpath('//*[@class="cp-js"]/p/text()').extract()
        # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract_first()
        item = WineItem()

        item['url'] = response.url
        item['slogan'] = re.sub(r'—*\s+', '', text1[0])  # slogan
        item['name'] = re.sub(r':', '', text1[1])  # name
        item['odor_type'] = re.sub(r':', '', text1[2])  # aroma type
        item['volume'] = re.sub(r':', '', text1[3])  # alcohol content
        item['net_weight'] = re.sub(r':', '', text1[4])  # net content
        item['general_agency'] = re.sub(r':', '', text1[5])  # general agent
        item['package'] = re.sub(r':', '', text1[6])  # packaging
        item['price'] = ''  # price

        item['sku'] = hashlib.sha1(to_bytes(item['name'])).hexdigest()
        item['images_urls_local'] = 'http://192.168.33.55/pics/' + item[
            'sku'] + '/1.jpg'

        # general agent
        # print(re.sub(r':+', '', text1[5]))

        # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract()
        # print(re.sub(r'\s+', '', text2))

        pictureUrls = response.xpath(
            '//*[@class="items"]/ul/li/img/@bimg').extract()

        img_urls = []
        for picture in pictureUrls:
            pictureUrl = response.urljoin(picture)
            img_urls.append(pictureUrl)

        item['images_urls'] = img_urls

        yield item
Example #41
    def process_request(self, request, spider):
        driver = spider.driver

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(driver, profile_xpath)

        # waiting links to other users are shown so the crawl can continue
        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #42
  def parse_items(self, response):     
      
      items = []
      item = AquariumScapeItem()
 
      src = response.css('#index_photo img::attr(src)').extract_first()
      tank_img = response.urljoin(src)
      item['image_urls'] = [tank_img]
      image_guid = hashlib.sha1(to_bytes(tank_img)).hexdigest()  # change to request.url after deprecation
      item['image_sha1'] = '%s.jpg' % (image_guid)
      item['user_name'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "reeftank_panel_top", " " ))]//span/text()').extract()
      item['rank'] = response.selector.css('.margin_b+ b::text').extract()
      item['average_category_score'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 7) and parent::*)]//b/text()').extract()
      item['average_tank_score'] = response.selector.css('.clear:nth-child(5) b::text').extract()
      item['num_ratings'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 3) and parent::*)]//b/text()').extract()
      item['fish_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list2", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
      item['plants_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list1", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
      item['comments'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image_comment", " " ))]/text()').extract()
      # Return all the found items
      items.append(item)
      return items
      
Example #43
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation
        folder = request.meta['folder']
        if len(folder):
            folder = folder[0]
            #pdb.set_trace()
            table = dict((ord(char), None) for char in "|\\?*<\":>+[]/'")
            folder = folder.translate(table)
        else:
            folder = u'未知'
        #pdb.set_trace()
        return '%s/%s.jpg' % (folder, image_guid)
Example #44
def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    if args and kwargs:
        raise ValueError(
            "Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {
        'jsonrpc': '2.0',
        'method': method,
        'params': args or kwargs,
        'id': 1
    }
    data = to_bytes(json.dumps(req))
    body = urllib.request.urlopen(url, data).read()
    res = json.loads(body.decode('utf-8'))
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)
Example #45
    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()


        __item_properties = dict(self._get_serialized_fields(item))

        __item = {
            'type': 'Feature',
            'properties': __item_properties,
            'geometry': {
                'type': 'Point',
                'coordinates': (
                    float(__item_properties.get('longitude', 0.0)),
                    float(__item_properties.get('latitude', 0.0)))}}

        self.file.write(to_bytes(
            self.encoder.encode(__item),
            self.encoding))
Example #46
    def test_ftp_local_filename(self):
        f, local_fname = tempfile.mkstemp()
        local_fname = to_bytes(local_fname)
        os.close(f)
        meta = {"ftp_local_filename": local_fname}
        meta.update(self.req_meta)
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta=meta)
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.body, local_fname)
            self.assertEqual(r.headers, {
                b'Local Filename': [local_fname],
                b'Size': [b'17']
            })
            self.assertTrue(os.path.exists(local_fname))
            with open(local_fname, "rb") as f:
                self.assertEqual(f.read(), b"I have the power!")
            os.remove(local_fname)

        return self._add_test_callbacks(d, _test)
Example #47
    def file_path(self, request, response=None, info=None):
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/movie250/%s.jpg' % (image_guid)
Example #48
    def process_item(self, item, spider):
        session = self.session()
        processing = Processing(s3_raw=hashlib.sha1(to_bytes(item['doc_url'])).hexdigest())
        session.add(processing)

        policy = Policy(
            country=item['country'],
            state=item['state'],
            data_source=item['data_source'],
            law_class=item['law_class'],
            title=item['title'],
            reference=item['reference'],
            authorship=item['authorship'],
            resume=item['summary'],
            publication_date=item['publication_date'],
            url=item['url'],
            doc_url=item['doc_url'] + self.API_key,
            doc_name=item['doc_name'],
            processing=processing
        )
        session.merge(policy)
        #print(policy)
        session.commit()
Example #49
 def _downloadImg(imgUrl):
     savePrefix = "/root/SPIDERIMAGESDB/DATASOURCE/Lofter/" + strDate + "/"
     if not os.path.isdir(savePrefix):
         os.makedirs(savePrefix)
     image_guid = hashlib.sha1(to_bytes(imgUrl)).hexdigest()
     image_name = None
     if ".jpg" in imgUrl:
         image_name = image_guid + ".jpg"
     if ".png" in imgUrl:
         image_name = image_guid + ".png"
     if ".jpeg" in imgUrl:
         image_name = image_guid + ".jpeg"
     if image_name is None:
         return
     try:
         with open(savePrefix+image_name, "wb") as writer:
             writer.write(urllib.request.urlopen(imgUrl).read())
     except Exception as es:
         print(es)
     with codecs.open("./download" + strDate + ".csv", mode='a', encoding='utf8') as writer:
         writer.write(imgUrl+',Lofter/'+strDate+'/'+image_name)
         writer.write('\n')
         print(imgUrl, image_name)
Example #50
    def process_request(self, request, spider):
        self.driver.get(request.url)
        if spider.name == 'wanted' and request.meta != {}:
            time.sleep(2)
            some_tag = self.driver.find_element_by_class_name(
                '_3h_f6TfissC0l7ogPcn7lY')
            action = ActionChains(self.driver)
            action.move_to_element(some_tag).perform()
            time.sleep(1)
        else:
            time.sleep(1.2)
        if spider.name == 'wanted' and request.meta == {}:
            self.infinite_scroll_control()
        elif spider.name == 'roketpunch' and 'job_card_company' in request.meta:
            self.see_more_button_click()
        elif spider.name == 'naver' and request.meta == {}:
            self.naver_more_button()

        body = to_bytes(text=self.driver.page_source)
        return HtmlResponse(url=request.url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #51
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation

        # Override this method: if the item contains save_sub_dir, store the image in that subdirectory
        item = request.meta['item']
        if 'save_sub_dir' in item:
            path = '%s/full/%s.jpg' % (item['save_sub_dir'], image_guid)
        else:
            path = 'full/%s.jpg' % image_guid
        return path
Example #52
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        if spider.browser:
            request.meta[
                'browser'] = self.browser  # to access driver from response
            self.browser.get(request.url)
            # wait js eval
            time.sleep(15)
            body = to_bytes(
                self.browser.page_source)  # body must be of type bytes
            return HtmlResponse(self.browser.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)
        else:
            return None
Example #53
    def parse_page(self, response):
        item = MeituItem()
        item['referer'] = response.meta['item']['referer']
        href = response.xpath("//div[@class='content-pic']/a/img/@src").extract_first()
        image_name = response.xpath("//div[@class='content']/h5/text()").extract_first()
        item['image_name'] = image_name.split('(')[0]
        item['image_hash'] = hashlib.sha1(to_bytes(item['image_name'])).hexdigest()
        try:
            item['image_num'] = image_name.split('(')[1].split(')')[0].zfill(2)
        except IndexError:
            item['image_num'] = '01'
        item['image_urls'] = [href]
        item['spider_name'] = self.name
        yield item

        # next page
        next_href = response.xpath("//a[text()='下一页']/@href").extract_first()
        if next_href:
            next_href = urljoin(response.url, next_href)
            yield scrapy.Request(
                url=next_href,
                callback=self.parse_page,
                meta={'item': {'referer': response.url}}
            )
Example #54
    def process_request(self, request, spider):
        driver = spider.driver

        logger.info('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        logger.info('SeleniumMiddleware - click more options')
        more_option = get_by_xpath(
            driver, '//div/div/button[@class="more-options dropdown-caret"]')
        more_option.send_keys(Keys.NULL)
        more_option.click()

        logger.info('SeleniumMiddleware - wait for names')
        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
        name.send_keys(Keys.NULL)

        #request.meta['driver'] = self.driver  # to access driver from response

        logging.info('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #55
 def store_response(self, spider, request, response):
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
     }
     with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
         f.write(to_bytes(repr(metadata)))
     with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
         pickle.dump(metadata, f, protocol=2)
     with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(response.headers))
     with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
         f.write(response.body)
     with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(request.headers))
     with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
         f.write(request.body)
Example #56
 def store_response(self, spider, request, response):
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         "url": request.url,
         "method": request.method,
         "status": response.status,
         "response_url": response.url,
         "timestamp": time(),
     }
     with self._open(os.path.join(rpath, "meta"), "wb") as f:
         f.write(to_bytes(repr(metadata)))
     with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
         pickle.dump(metadata, f, protocol=4)
     with self._open(os.path.join(rpath, "response_headers"), "wb") as f:
         f.write(headers_dict_to_raw(response.headers))
     with self._open(os.path.join(rpath, "response_body"), "wb") as f:
         f.write(response.body)
     with self._open(os.path.join(rpath, "request_headers"), "wb") as f:
         f.write(headers_dict_to_raw(request.headers))
     with self._open(os.path.join(rpath, "request_body"), "wb") as f:
         f.write(request.body)
Example #57
    def process_item(self, item, spider):
        '''
        Download images.
        :param item:
        :param spider:
        :return:
        '''
        for image in item['image_urls'].split(','):
            # image file name
            image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
            image_name = '%s.jpg' % (image_guid)

            house_id = item['from_url'].split('/')[-1].replace('.html', '')
            file_path = '%s/%s' % (self.imagepath, house_id)

            if not os.path.exists(file_path):
                os.makedirs(file_path)

            image_path = '%s/%s/%s' % (self.imagepath, house_id, image_name)

            if not os.path.exists(image_path):
                urlretrieve(image, image_path)
            else:
                raise DropItem('It exists!')
Example #58
    def file_path(self, request, response=None, info=None):
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url
        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation
        word = str(request.meta['word']).split('word=')[-1]
        try:
            word = urllib.unquote(word).decode('utf-8')
        except Exception as ex:
            word = urllib.parse.unquote(word)
        print(word + "/%s.jpg" % image_guid)
        return word + '/%s.jpg' % (image_guid)
Example #59
    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
Example #60
    def file_path(self, request, response=None, info=None):
        start_time = self._get_start_time(info.spider)
        start_time_str = start_time.strftime('%Y%m%d_%H%M%S')
        content_type = ''
        if response:
            # This is to cover the case when the url has a "." after the last "/"
            # and the text after the "." is not a file extension, but the response is JSON
            content_type = to_native_str(response.headers['Content-Type'])
        url = request.url
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]

        if hasattr(info.spider, 'ext'):
            media_ext = info.spider.ext
        elif not media_ext or ('json' in content_type
                               and media_ext != '.json'):
            media_ext = '.json'
        # Put files in a directory named after the scraper they came from, and the scraper start time
        if hasattr(info.spider, 'sample') and info.spider.sample == 'true':
            return '%s_sample/%s/%s%s' % (info.spider.name, start_time_str,
                                          media_guid, media_ext)
        else:
            return '%s/%s/%s%s' % (info.spider.name, start_time_str,
                                   media_guid, media_ext)