Code example #1
File: webclient.py Project: 390218462/scrapy
    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply with the Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, there is
        # no need to add the _waitForDisconnect callback. Specifically,
        # this avoids the AttributeError raised when the
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 server decides to keep the connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0
Code example #2
 def get_defaults_spider_mw(self):
     crawler = get_crawler(Spider)
     spider = crawler._create_spider('foo')
     defaults = {
         to_bytes(k): [to_bytes(v)]
         for k, v in crawler.settings.get('DEFAULT_REQUEST_HEADERS').items()
     }
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Code example #3
File: test_webclient.py Project: 01-/scrapy
 def testHostHeader(self):
     # if we pass the Host header explicitly, it should be used; otherwise
     # it should be extracted from the url
     return defer.gatherResults([
         getPage(self.getURL("host")).addCallback(
             self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno)),
         getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback(
             self.assertEquals, to_bytes("www.example.com"))])
Code example #4
File: mockserver.py Project: 0daybug/scrapy
 def renderRequest(self, request, nlist):
     s = """<html> <head></head> <body>"""
     args = request.args.copy()
     for nl in nlist:
         args[b"n"] = [to_bytes(str(nl))]
         argstr = urlencode(args, doseq=True)
         s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
     s += """</body>"""
     request.write(to_bytes(s))
     request.finish()
Code example #5
File: response.py Project: ArturGaspar/scrapy
def response_httprepr(response):
    """Return raw HTTP representation (as bytes) of the given response. This
    is provided only for reference, since it's not the exact stream of bytes
    that was received (that's not exposed by Twisted).
    """
    s = b"HTTP/1.1 " + to_bytes(str(response.status)) + b" " + \
        to_bytes(http.RESPONSES.get(response.status, b'')) + b"\r\n"
    if response.headers:
        s += response.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += response.body
    return s
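
A minimal usage sketch, assuming the function above is available as scrapy.utils.response.response_httprepr and that scrapy.http.Response is importable (as in upstream Scrapy):
from scrapy.http import Response
from scrapy.utils.response import response_httprepr

response = Response(
    url='http://www.example.com/',
    status=200,
    headers={'Content-Type': 'text/html'},
    body=b'<html></html>',
)
# prints something close to:
# b'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html></html>'
print(response_httprepr(response))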
Code example #6
File: url.py Project: 447327642/scrapy
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
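
A small sketch of the encoding split described in the comments above (assuming to_native_str, quote and _safe_chars from scrapy.utils.url are in scope): the path is quoted as UTF-8 while the query follows the page encoding.
from urllib.parse import urlparse

parts = urlparse(u'http://www.example.com/caf\xe9?nombre=cura\xe7ao')
# roughly: ('http', 'www.example.com', '/caf%C3%A9', '', 'nombre=cura%E7ao', '')
print(_safe_ParseResult(parts, encoding='latin-1'))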
Code example #7
File: request.py Project: nguyenhongson03/scrapy
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname) + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
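
For reference, a quick sketch of what this produces for a simple GET request (assuming scrapy.Request and that the function is importable as scrapy.utils.request.request_httprepr):
from scrapy import Request
from scrapy.utils.request import request_httprepr

req = Request('http://www.example.com/page?x=1')
# b'GET /page?x=1 HTTP/1.1\r\nHost: www.example.com\r\n\r\n'
print(request_httprepr(req))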
Code example #8
def multpart_encode(form_data):
    _boundary = to_bytes(''.join(
        random.choice(string.digits + string.ascii_letters) for i in range(20)))
    content_type = "multipart/form-data; boundary=" + _boundary.decode('ascii')

    body = []
    for name, value in form_data.items():
        body.append(b'--' + _boundary)
        body.append(b'Content-Disposition: form-data; name="' + to_bytes(name) + b'"')
        body.append(b'')
        body.append(to_bytes(value))

    body.append(b'--' + _boundary + b'--')

    return content_type, b'\r\n'.join(body)
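
A hedged usage sketch of the helper above (the field names are made up for illustration):
content_type, body = multpart_encode({'username': 'spider', 'page': '1'})
# content_type -> 'multipart/form-data; boundary=<20 random alphanumerics>'
# body         -> b'--<boundary>\r\nContent-Disposition: form-data; name="username"\r\n\r\nspider\r\n...'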
Code example #9
File: ftp.py Project: ArturGaspar/scrapy
 def _build_response(self, result, request, protocol):
     self.result = result
     respcls = responsetypes.from_args(url=request.url)
     protocol.close()
     body = protocol.filename or protocol.body.read()
     headers = {"local filename": protocol.filename or '', "size": protocol.size}
     return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
Code example #10
 def thumb_path(self, request, thumb_id, response=None, info=None):
     url = request.url
     image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
     if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
         return '{ig}.jpg'.format(ig=image_guid)
     else:
         return 'thumbs/{p}/{ig}.jpg'.format(p=thumb_id, ig=image_guid)
Code example #11
File: images.py Project: GregoryVigoTorres/scrapy
    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings

            warnings.warn(
                "ImagesPipeline.thumb_key(url) method is deprecated, please use "
                "thumb_path(request, thumb_id, response=None, info=None) instead",
                category=ScrapyDeprecationWarning,
                stacklevel=1,
            )

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, "_base"):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return "thumbs/%s/%s.jpg" % (thumb_id, thumb_guid)
Code example #12
File: url.py Project: naisanza/scrapy
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_bytes(url, encoding))
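
Usage sketch: a URL string gets parsed, while an already-parsed ParseResult is returned untouched.
from urllib.parse import urlparse

parts = urlparse('http://www.example.com/index.html')
assert parse_url(parts) is parts                          # already parsed: returned as-is
parsed = parse_url('http://www.example.com/index.html')   # string input is urlparsed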
Code example #13
File: http11.py Project: bedreamer/scrapy
 def requestTunnel(self, protocol):
     """Asks the proxy to open a tunnel."""
     tunnelReq = (
         b'CONNECT ' +
         to_bytes(self._tunneledHost, encoding='ascii') + b':' +
         to_bytes(str(self._tunneledPort)) +
         b' HTTP/1.1\r\n')
     if self._proxyAuthHeader:
         tunnelReq += \
             b'Proxy-Authorization: ' + self._proxyAuthHeader + b'\r\n'
     tunnelReq += b'\r\n'
     protocol.transport.write(tunnelReq)
     self._protocolDataReceived = protocol.dataReceived
     protocol.dataReceived = self.processProxyResponse
     self._protocol = protocol
     return protocol
Code example #14
 def _test_request(self, **kwargs):
     r = self.request_class('http://scrapytest.org/rpc2', **kwargs)
     self.assertEqual(r.headers[b'Content-Type'], b'text/xml')
     self.assertEqual(r.body, to_bytes(xmlrpclib.dumps(**kwargs)))
     self.assertEqual(r.method, 'POST')
     self.assertEqual(r.encoding, kwargs.get('encoding', 'utf-8'))
     self.assertTrue(r.dont_filter)
Code example #15
File: exporters.py Project: ericmoritz/scrapy-music
    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        if hasattr(item, '_context'):
            itemdict['@context'] = item._context

        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Code example #16
File: test_webclient.py Project: 01-/scrapy
    def testParse(self):
        lip = '127.0.0.1'
        tests = (
    ("http://127.0.0.1?c=v&c2=v2#fragment",     ('http', lip, lip, 80, '/?c=v&c2=v2')),
    ("http://127.0.0.1/?c=v&c2=v2#fragment",    ('http', lip, lip, 80, '/?c=v&c2=v2')),
    ("http://127.0.0.1/foo?c=v&c2=v2#frag",     ('http', lip, lip, 80, '/foo?c=v&c2=v2')),
    ("http://127.0.0.1:100?c=v&c2=v2#fragment", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
    ("http://127.0.0.1:100/?c=v&c2=v2#frag",    ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
    ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/foo?c=v&c2=v2')),

    ("http://127.0.0.1",              ('http', lip, lip, 80, '/')),
    ("http://127.0.0.1/",             ('http', lip, lip, 80, '/')),
    ("http://127.0.0.1/foo",          ('http', lip, lip, 80, '/foo')),
    ("http://127.0.0.1?param=value",  ('http', lip, lip, 80, '/?param=value')),
    ("http://127.0.0.1/?param=value", ('http', lip, lip, 80, '/?param=value')),
    ("http://127.0.0.1:12345/foo",    ('http', lip+':12345', lip, 12345, '/foo')),
    ("http://spam:12345/foo",         ('http', 'spam:12345', 'spam', 12345, '/foo')),
    ("http://spam.test.org/foo",      ('http', 'spam.test.org', 'spam.test.org', 80, '/foo')),

    ("https://127.0.0.1/foo",         ('https', lip, lip, 443, '/foo')),
    ("https://127.0.0.1/?param=value", ('https', lip, lip, 443, '/?param=value')),
    ("https://127.0.0.1:12345/",      ('https', lip+':12345', lip, 12345, '/')),

    ("http://scrapytest.org/foo ",    ('http', 'scrapytest.org', 'scrapytest.org', 80, '/foo')),
    ("http://egg:7890 ",              ('http', 'egg:7890', 'egg', 7890, '/')),
    )

        for url, test in tests:
            test = tuple(
                to_bytes(x) if not isinstance(x, int) else x for x in test)
            self.assertEquals(client._parse(url), test, url)
Code example #17
File: images.py Project: ArturGaspar/scrapy
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)
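
The store path is derived purely from the SHA1 hash of the URL, so the same image URL always maps to the same file. A rough illustration (the URL is made up):
import hashlib

url = 'http://www.example.com/images/products/12345.jpg'
image_guid = hashlib.sha1(url.encode('utf8')).hexdigest()
# -> 'full/<40 hex chars>.jpg', identical every time the same URL is downloaded
print('full/%s.jpg' % image_guid)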
Code example #18
File: files.py Project: CPoirot3/scrapy
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        return 'full/%s%s' % (media_guid, media_ext)
Code example #19
File: exporters.py Project: voith/scrapy
 def export_item(self, item):
     if self.first_item:
         self.first_item = False
     else:
         self.file.write(b",\n")
     itemdict = dict(self._get_serialized_fields(item))
     self.file.write(to_bytes(self.encoder.encode(itemdict)))
Code example #20
File: utils.py Project: yilunanxia/scrapy-splash
 def to_native_str(text, encoding=None, errors='strict'):
     """ Return str representation of `text`
     (bytes in Python 2.x and unicode in Python 3.x). """
     if six.PY2:
         return to_bytes(text, encoding, errors)
     else:
         return to_unicode(text, encoding, errors)
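
A couple of hedged calls showing the intent (on Python 3 both return the native str):
to_native_str(b'scrapy')   # bytes decoded as UTF-8 -> 'scrapy'
to_native_str(u'scrapy')   # already text -> 'scrapy'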
Code example #21
    def test_3xx_and_invalid_gzipped_body_must_redirect(self):
        """Regression test for a failure when redirecting a compressed
        request.

        This happens when the httpcompression middleware is executed before the
        redirect middleware and attempts to decompress a non-compressed body.
        In particular, it happens when a website returns a 30x response with a
        'Content-Encoding: gzip' header, resulting in the error below:

            exceptions.IOError: Not a gzipped file

        """
        req = Request('http://example.com')
        body = b'<p>You are being redirected</p>'
        resp = Response(req.url, status=302, body=body, headers={
            'Content-Length': str(len(body)),
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
            'Location': 'http://example.com/login',
        })
        ret = self._download(request=req, response=resp)
        self.assertTrue(isinstance(ret, Request),
                        "Not redirected: {0!r}".format(ret))
        self.assertEqual(to_bytes(ret.url), resp.headers['Location'],
                         "Not redirected to location header")
Code example #22
File: test_deltafetch.py Project: nyov/scrapylib
 def test_get_key(self):
     mw = self.mwcls(self.temp_dir, reset=True)
     test_req1 = Request('http://url1')
     self.assertEqual(mw._get_key(test_req1),
                      to_bytes(request_fingerprint(test_req1)))
     test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
     self.assertEqual(mw._get_key(test_req2), b'dfkey1')
Code example #23
 def writeLine(self, data):
     """
     Write the data to the process stdin, adding the new-line delimiter if necessary
     """
     data = to_bytes(data)
     if not data.endswith(b'\n'):
         data += self.__delimiter
     self.transport.write(data)
Code example #24
File: mockserver.py Project: 0daybug/scrapy
 def render_GET(self, request):
     output = {
         'headers': dict(
             (to_unicode(k), [to_unicode(v) for v in vs])
             for k, vs in request.requestHeaders.getAllRawHeaders()),
         'body': to_unicode(request.content.read()),
     }
     return to_bytes(json.dumps(output))
Code example #25
File: exporters.py Project: 0daybug/scrapy
 def export_item(self, item):
     if self.first_item:
         self.first_item = False
     else:
         self.file.write(b',\n')
     itemdict = dict(self._get_serialized_fields(item))
     data = self.encoder.encode(itemdict)
     self.file.write(to_bytes(data, self.encoding))
Code example #26
 def browser_open(burl):
     path = urlparse(burl).path
     if not os.path.exists(path):
         path = burl.replace('file://', '')
     with open(path, "rb") as f:
         bbody = f.read()
     self.assertIn(b'<base href="' + to_bytes(url) + b'">', bbody)
     return True
Code example #27
 def file_path(self, request, response=None, info=None):
     url = request.url
     image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
     if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
         return '{ig}.jpg'.format(ig=image_guid)
     elif self.conf["IMAGES_STORE_FORMAT"] == 'THUMBS':
         return 'thumbs/{p}/{ig}.jpg'.format(p=next(iter(list(settings.get('IMAGES_THUMBS').keys()))), ig=image_guid)
     else:
         return 'full/{ig}.jpg'.format(ig=image_guid)
Code example #28
File: ftp.py Project: ArturGaspar/scrapy
 def _failed(self, result, request):
     message = result.getErrorMessage()
     if result.type == CommandFailed:
         m = _CODE_RE.search(message)
         if m:
             ftpcode = m.group()
             httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
             return Response(url=request.url, status=httpcode, body=to_bytes(message))
     raise result.type(result.value)
Code example #29
File: request.py Project: 01-/scrapy
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.

    """
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                 for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
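
A short check of the property described in the docstring (assuming scrapy.Request and that the function is importable as scrapy.utils.request.request_fingerprint):
from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.example.com/query?id=111&cat=222')
r2 = Request('http://www.example.com/query?cat=222&id=111')
# canonicalize_url() sorts the query arguments, so both fingerprints match
assert request_fingerprint(r1) == request_fingerprint(r2)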
Code example #30
File: test_webclient.py Project: 01-/scrapy
 def test_timeoutNotTriggering(self):
     """
     When a non-zero timeout is passed to L{getPage} and the page is
     retrieved before the timeout period elapses, the L{Deferred} is
     called back with the contents of the page.
     """
     d = getPage(self.getURL("host"), timeout=100)
     d.addCallback(
         self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno))
     return d
Code example #31
def getPage(url,
            contextFactory=None,
            response_transform=None,
            *args,
            **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    def _clientfactory(url, *args, **kwargs):
        url = to_unicode(url)
        timeout = kwargs.pop('timeout', 0)
        f = client.ScrapyHTTPClientFactory(Request(url, *args, **kwargs),
                                           timeout=timeout)
        f.deferred.addCallback(response_transform or (lambda r: r.body))
        return f

    from twisted.web.client import _makeGetterFactory
    return _makeGetterFactory(to_bytes(url),
                              _clientfactory,
                              contextFactory=contextFactory,
                              *args,
                              **kwargs).deferred
Code example #32
File: http11.py Project: Jz52710/pythonPChong
    def _get_agent(self, request, timeout):
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if scheme == b'https' and not omitConnectTunnel:
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(reactor, proxyConf,
                    contextFactory=self._contextFactory, connectTimeout=timeout,
                    bindAddress=bindaddress, pool=self._pool)
            else:
                return self._ProxyAgent(reactor, proxyURI=to_bytes(proxy, encoding='ascii'),
                    connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        return self._Agent(reactor, contextFactory=self._contextFactory,
            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
Code example #33
    def process_item(self, item, spider):
        session = self.session()
        processing = Processing(s3_raw=hashlib.sha1(to_bytes(
            item['doc_url'])).hexdigest()).save_to_db(session)

        policy = Policy(country=item['country'],
                        geo_code=item['geo_code'],
                        level=item['level'],
                        source=item['source'],
                        title=item['title'],
                        reference=item['reference'],
                        authorship=item['authorship'],
                        resume=item['resume'],
                        publication_date=item['publication_date'],
                        enforcement_date=item['enforcement_date'],
                        url=item['url'],
                        doc_url=item['doc_url'],
                        doc_name=item['doc_name'],
                        doc_type=item['doc_type'],
                        processing_id=processing.id).save_to_db(session)
Code example #34
File: middlewares.py Project: Jhingun1/linkedin-1
    def process_request(self, request, spider):
        cookies = spider.cookies
        driver = init_chromium(spider.selenium_hostname, cookies)

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        spider.wait_page_completion(driver=driver)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        request.meta['driver'] = driver
        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Code example #35
    def file_path(self, request, response=None, info=None):
        """根据request返回存储文件的相对路径"""
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'FilesPipeline.file_key(url) method is deprecated, please use '
                'file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # sha1 hash of the url
        media_ext = os.path.splitext(url)[1]
        return 'files/%s%s' % (media_guid, media_ext)
Code example #36
File: response.py Project: wanglikang/RetardedSpider
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
Code example #37
File: pipelines.py Project: Flo95x/mba-pipeline
    def file_path(self, request, response=None, info=None, *, item=None):
        # function 2
        img_path = ""
        try:
            # HERE ARE CUSTOM CHANGES
            marketplace = request.meta.get("marketplace")
            asin = request.meta.get("asin")

            img_path = os.path.join(marketplace, asin + ".jpg")
        except Exception:
            media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
            media_ext = os.path.splitext(request.url)[1]
            # Handles empty and wild extensions by trying to guess the
            # mime type then extension or default to empty string otherwise
            if media_ext not in mimetypes.types_map:
                media_ext = ''
                media_type = mimetypes.guess_type(request.url)[0]
                if media_type:
                    media_ext = mimetypes.guess_extension(media_type)
            img_path = f'full/{media_guid}{media_ext}'
        return img_path
Code example #38
def run(output_path, download_workers):
    crawl(download_workers)
    reactor.run()  # the script will block here until the last crawl call is finished
    encoder = ScrapyJSONEncoder()
    with open("results/publications.jl", "rb+") as publications_info_file:
        with open(output_path, "wb+") as output_file:
            output_file.write(b"[")
            first = True
            while True:
                line = publications_info_file.readline()
                if not line:
                    break
                if first:
                    output_file.write(b"\n")
                    first = False
                else:
                    output_file.write(b",\n")
                company_info_data = json.loads(line)
                data = encoder.encode(company_info_data)
                output_file.write(to_bytes(data))
            output_file.write(b"\n]\n")
Code example #39
File: http2.py Project: wwjiang007/scrapy
    def _get_agent(self, request: Request,
                   timeout: Optional[float]) -> H2Agent:
        from twisted.internet import reactor
        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxy_host = proxy_host.decode()
            omit_connect_tunnel = b'noconnect' in proxy_params
            if omit_connect_tunnel:
                warnings.warn(
                    "Using HTTPS proxies in the noconnect mode is not "
                    "supported by the downloader handler. If you use Zyte "
                    "Smart Proxy Manager, it doesn't require this mode "
                    "anymore, so you should update scrapy-crawlera to "
                    "scrapy-zyte-smartproxy and remove '?noconnect' from the "
                    "Zyte Smart Proxy Manager URL.")

            if scheme == b'https' and not omit_connect_tunnel:
                # ToDo
                raise NotImplementedError(
                    'Tunneling via CONNECT method using HTTP/2.0 is not yet supported'
                )
            return self._ProxyAgent(
                reactor=reactor,
                context_factory=self._context_factory,
                proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
                connect_timeout=timeout,
                bind_address=bind_address,
                pool=self._pool,
            )

        return self._Agent(
            reactor=reactor,
            context_factory=self._context_factory,
            connect_timeout=timeout,
            bind_address=bind_address,
            pool=self._pool,
        )
Code example #40
    def page_parse(self, response):
        # pagetitle=response.xpath('/html/head/title').extract_first()
        # print("detail parsing:", pagetitle)

        text1 = response.xpath('//*[@class="cp-js"]/p/text()').extract()
        # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract_first()
        item = WineItem()

        item['url'] = response.url
        item['slogan'] = re.sub(r'—*\s+', '', text1[0])  # slogan
        item['name'] = re.sub(r':', '', text1[1])  # name
        item['odor_type'] = re.sub(r':', '', text1[2])  # aroma type
        item['volume'] = re.sub(r':', '', text1[3])  # alcohol content
        item['net_weight'] = re.sub(r':', '', text1[4])  # net content
        item['general_agency'] = re.sub(r':', '', text1[5])  # general agency
        item['package'] = re.sub(r':', '', text1[6])  # packaging
        item['price'] = ''  # price

        item['sku'] = hashlib.sha1(to_bytes(item['name'])).hexdigest()
        item['images_urls_local'] = 'http://192.168.33.55/pics/' + item[
            'sku'] + '/1.jpg'

        # general agency
        # print(re.sub(r':+', '', text1[5]))

        # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract()
        # print(re.sub(r'\s+', '', text2))

        pictureUrls = response.xpath(
            '//*[@class="items"]/ul/li/img/@bimg').extract()

        img_urls = []
        for picture in pictureUrls:
            pictureUrl = response.urljoin(picture)
            img_urls.append(pictureUrl)

        item['images_urls'] = img_urls

        yield item
Code example #41
File: middlewares.py Project: hijm/linkedin
    def process_request(self, request, spider):
        driver = spider.driver

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(driver, profile_xpath)

        # waiting links to other users are shown so the crawl can continue
        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Code example #42
  def parse_items(self, response):     
      
      items = []
      item = AquariumScapeItem()
 
      src = response.css('#index_photo img::attr(src)').extract_first()
      tank_img = response.urljoin(src)
      item['image_urls'] = [tank_img]
      image_guid = hashlib.sha1(to_bytes(tank_img)).hexdigest()  # change to request.url after deprecation
      item['image_sha1'] = '%s.jpg' % (image_guid)
      item['user_name'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "reeftank_panel_top", " " ))]//span/text()').extract()
      item['rank'] = response.selector.css('.margin_b+ b::text').extract()
      item['average_category_score'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 7) and parent::*)]//b/text()').extract()
      item['average_tank_score'] = response.selector.css('.clear:nth-child(5) b::text').extract()
      item['num_ratings'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 3) and parent::*)]//b/text()').extract()
      item['fish_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list2", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
      item['plants_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list1", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
      item['comments'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image_comment", " " ))]/text()').extract()
      # Return all the found items
      items.append(item)
      return items
      
Code example #43
File: pipelines.py Project: hejianjun/t66y-spider
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation
        folder = request.meta['folder']
        if len(folder):
            folder = folder[0]
            #pdb.set_trace()
            table = dict((ord(char), None) for char in "|\\?*<\":>+[]/'")
            folder = folder.translate(table)
        else:
            folder = u'未知'
        #pdb.set_trace()
        return '%s/%s.jpg' % (folder, image_guid)
Code example #44
File: jsonrpc.py Project: robd003/scrapy-jsonrpc
def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    if args and kwargs:
        raise ValueError(
            "Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {
        'jsonrpc': '2.0',
        'method': method,
        'params': args or kwargs,
        'id': 1
    }
    data = to_bytes(json.dumps(req))
    body = urllib.request.urlopen(url, data).read()
    res = json.loads(body.decode('utf-8'))
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)
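
A hedged usage sketch of the client above (the endpoint URL and method name are hypothetical):
try:
    result = jsonrpc_client_call('http://localhost:6080/jsonrpc/crawler', 'list_resources')
except JsonRpcError as e:
    print(e)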
Code example #45
File: exporters.py Project: apehex/adspying
    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()


        __item_properties = dict(self._get_serialized_fields(item))

        __item = {
            'type': 'Feature',
            'properties': __item_properties,
            'geometry': {
                'type': 'Point',
                'coordinates': (
                    float(__item_properties.get('longitude', 0.0)),
                    float(__item_properties.get('latitude', 0.0)))}}

        self.file.write(to_bytes(
            self.encoder.encode(__item),
            self.encoding))
Code example #46
    def test_ftp_local_filename(self):
        f, local_fname = tempfile.mkstemp()
        local_fname = to_bytes(local_fname)
        os.close(f)
        meta = {"ftp_local_filename": local_fname}
        meta.update(self.req_meta)
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta=meta)
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.body, local_fname)
            self.assertEqual(r.headers, {
                b'Local Filename': [local_fname],
                b'Size': [b'17']
            })
            self.assertTrue(os.path.exists(local_fname))
            with open(local_fname, "rb") as f:
                self.assertEqual(f.read(), b"I have the power!")
            os.remove(local_fname)

        return self._add_test_callbacks(d, _test)
Code example #47
    def file_path(self, request, response=None, info=None):
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/movie250/%s.jpg' % (image_guid)
Code example #48
    def process_item(self, item, spider):
        session = self.session()
        processing = Processing(s3_raw=hashlib.sha1(to_bytes(item['doc_url'])).hexdigest())
        session.add(processing)

        policy = Policy(
            country=item['country'],
            state = item['state'],
            data_source=item['data_source'],
            law_class = item['law_class'],
            title=item['title'],
            reference=item['reference'],
            authorship=item['authorship'],
            resume=item['summary'],
            publication_date=item['publication_date'],
            url=item['url'],
            doc_url=item['doc_url'] + self.API_key,
            doc_name=item['doc_name'],
            processing = processing
        )
        session.merge(policy)
        #print(policy)
        session.commit()
Code example #49
 def _downloadImg(imgUrl):
     savePrefix = "/root/SPIDERIMAGESDB/DATASOURCE/Lofter/" + strDate + "/"
     if not os.path.isdir(savePrefix):
         os.makedirs(savePrefix)
     image_guid = hashlib.sha1(to_bytes(imgUrl)).hexdigest()
     image_name = None
     if ".jpg" in imgUrl:
         image_name = image_guid + ".jpg"
     if ".png" in imgUrl:
         image_name = image_guid + ".png"
     if ".jpeg" in imgUrl:
         image_name = image_guid + ".jpeg"
     if image_name is None:
         return
     try:
         with open(savePrefix+image_name, "wb") as writer:
             writer.write(urllib.request.urlopen(imgUrl).read())
     except Exception as es:
         print(es)
     with codecs.open("./download" + strDate + ".csv", mode='a', encoding='utf8') as writer:
         writer.write(imgUrl+',Lofter/'+strDate+'/'+image_name)
         writer.write('\n')
         print(imgUrl, image_name)
Code example #50
    def process_request(self, request, spider):
        self.driver.get(request.url)
        if spider.name == 'wanted' and request.meta != {}:
            time.sleep(2)
            some_tag = self.driver.find_element_by_class_name(
                '_3h_f6TfissC0l7ogPcn7lY')
            action = ActionChains(self.driver)
            action.move_to_element(some_tag).perform()
            time.sleep(1)
        else:
            time.sleep(1.2)
        if spider.name == 'wanted' and request.meta == {}:
            self.infinite_scroll_control()
        elif spider.name == 'roketpunch' and 'job_card_company' in request.meta:
            self.see_more_button_click()
        elif spider.name == 'naver' and request.meta == {}:
            self.naver_more_button()

        body = to_bytes(text=self.driver.page_source)
        return HtmlResponse(url=request.url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Code example #51
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation

        # Override of this method: if the item contains save_sub_dir, put the image into that subfolder
        item = request.meta['item']
        if 'save_sub_dir' in item:
            path = '%s/full/%s.jpg' % (item['save_sub_dir'], image_guid)
        else:
            path = 'full/%s.jpg' % image_guid
        return path
Code example #52
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        if spider.browser:
            request.meta[
                'browser'] = self.browser  # to access driver from response
            self.browser.get(request.url)
            # wait js eval
            time.sleep(15)
            body = to_bytes(
                self.browser.page_source)  # body must be of type bytes
            return HtmlResponse(self.browser.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)
        else:
            return None
Code example #53
    def parse_page(self, response):
        item = MeituItem()
        item['referer'] = response.meta['item']['referer']
        href = response.xpath("//div[@class='content-pic']/a/img/@src").extract_first()
        image_name = response.xpath("//div[@class='content']/h5/text()").extract_first()
        item['image_name'] = image_name.split('(')[0]
        item['image_hash'] = hashlib.sha1(to_bytes(item['image_name'])).hexdigest()
        try:
            item['image_num'] = image_name.split('(')[1].split(')')[0].zfill(2)
        except IndexError:
            item['image_num'] = '01'
        item['image_urls'] = [href]
        item['spider_name'] = self.name
        yield item

        # next page
        next_href = response.xpath("//a[text()='下一页']/@href").extract_first()
        if next_href:
            next_href = urljoin(response.url, next_href)
            yield scrapy.Request(
                url=next_href,
                callback=self.parse_page,
                meta={'item': {'referer': response.url}}
            )
Code example #54
File: middlewares.py Project: ridhish10/linkedin
    def process_request(self, request, spider):
        driver = spider.driver

        logger.info('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        logger.info('SeleniumMiddleware - click more options')
        more_option = get_by_xpath(
            driver, '//div/div/button[@class="more-options dropdown-caret"]')
        more_option.send_keys(Keys.NULL)
        more_option.click()

        logger.info('SeleniumMiddleware - wait for names')
        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
        name.send_keys(Keys.NULL)

        #request.meta['driver'] = self.driver  # to access driver from response

        logging.info('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Code example #55
File: httpcache.py Project: zhangtao1987/scrapy
 def store_response(self, spider, request, response):
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
     }
     with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
         f.write(to_bytes(repr(metadata)))
     with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
         pickle.dump(metadata, f, protocol=2)
     with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(response.headers))
     with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
         f.write(response.body)
     with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(request.headers))
     with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
         f.write(request.body)
Code example #56
 def store_response(self, spider, request, response):
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         "url": request.url,
         "method": request.method,
         "status": response.status,
         "response_url": response.url,
         "timestamp": time(),
     }
     with self._open(os.path.join(rpath, "meta"), "wb") as f:
         f.write(to_bytes(repr(metadata)))
     with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
         pickle.dump(metadata, f, protocol=4)
     with self._open(os.path.join(rpath, "response_headers"), "wb") as f:
         f.write(headers_dict_to_raw(response.headers))
     with self._open(os.path.join(rpath, "response_body"), "wb") as f:
         f.write(response.body)
     with self._open(os.path.join(rpath, "request_headers"), "wb") as f:
         f.write(headers_dict_to_raw(request.headers))
     with self._open(os.path.join(rpath, "request_body"), "wb") as f:
         f.write(request.body)
Code example #57
    def process_item(self, item, spider):
        '''
        Download the images for the item
        :param item:
        :param spider:
        :return:
        '''
        for image in item['image_urls'].split(','):
            # image file name
            image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
            image_name = '%s.jpg' % (image_guid)

            house_id = item['from_url'].split('/')[-1].replace('.html', '')
            file_path = '%s/%s' % (self.imagepath, house_id)

            if not os.path.exists(file_path):
                os.makedirs(file_path)

            image_path = '%s/%s/%s' % (self.imagepath, house_id, image_name)

            if not os.path.exists(image_path):
                urlretrieve(image, image_path)
            else:
                raise DropItem('It exists!')
Code example #58
File: pipelines.py Project: liuguiyangnwpu/crawlImgs
    def file_path(self, request, response=None, info=None):
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url
        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation
        word = str(request.meta['word']).split('word=')[-1]
        try:
            word = urllib.unquote(word).decode('utf-8')
        except Exception as ex:
            word = urllib.parse.unquote(word)
        print(word + "/%s.jpg" % image_guid)
        return word + '/%s.jpg' % (image_guid)
Code example #59
File: images.py Project: PeterLUYP/2016YCProject
    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
Code example #60
    def file_path(self, request, response=None, info=None):
        start_time = self._get_start_time(info.spider)
        start_time_str = start_time.strftime('%Y%m%d_%H%M%S')
        content_type = ''
        if response:
            # This covers the case where the url has a '.' after the last '/'
            # but the text after the '.' is not a file extension and the response is JSON
            content_type = to_native_str(response.headers['Content-Type'])
        url = request.url
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]

        if hasattr(info.spider, 'ext'):
            media_ext = info.spider.ext
        elif not media_ext or ('json' in content_type
                               and media_ext != '.json'):
            media_ext = '.json'
        # Put files in a directory named after the scraper they came from, and the scraper starttime
        if hasattr(info.spider, 'sample') and info.spider.sample == 'true':
            return '%s_sample/%s/%s%s' % (info.spider.name, start_time_str,
                                          media_guid, media_ext)
        else:
            return '%s/%s/%s%s' % (info.spider.name, start_time_str,
                                   media_guid, media_ext)