Example #1
    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply with the Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, there is
        # no need to add the callback _waitForDisconnect.
        # Specifically, this avoids the AttributeError exception when
        # the clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0
Example #2
 def get_defaults_spider_mw(self):
     crawler = get_crawler(Spider)
     spider = crawler._create_spider('foo')
     defaults = {
         to_bytes(k): [to_bytes(v)]
         for k, v in crawler.settings.get('DEFAULT_REQUEST_HEADERS').items()
     }
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Example #3
 def testHostHeader(self):
     # if we pass the Host header explicitly, it should be used; otherwise
     # it should be extracted from the url
     return defer.gatherResults([
         getPage(self.getURL("host")).addCallback(
             self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno)),
         getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback(
             self.assertEquals, to_bytes("www.example.com"))])
Example #4
 def renderRequest(self, request, nlist):
     s = """<html> <head></head> <body>"""
     args = request.args.copy()
     for nl in nlist:
         args[b"n"] = [to_bytes(str(nl))]
         argstr = urlencode(args, doseq=True)
         s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
     s += """</body>"""
     request.write(to_bytes(s))
     request.finish()
Example #5
def response_httprepr(response):
    """Return raw HTTP representation (as bytes) of the given response. This
    is provided only for reference, since it's not the exact stream of bytes
    that was received (that's not exposed by Twisted).
    """
    s = b"HTTP/1.1 " + to_bytes(str(response.status)) + b" " + \
        to_bytes(http.RESPONSES.get(response.status, b'')) + b"\r\n"
    if response.headers:
        s += response.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += response.body
    return s
Example #6
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
Example #7
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be send when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname) + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
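A minimal usage sketch for the function above (the URL and output are illustrative, assuming the implementation shown here): request_httprepr rebuilds an approximate raw HTTP/1.1 request from a Request object.

from scrapy import Request
from scrapy.utils.request import request_httprepr

req = Request('http://www.example.com/some/page.html?arg=1')
print(request_httprepr(req))
# roughly: b'GET /some/page.html?arg=1 HTTP/1.1\r\nHost: www.example.com\r\n\r\n'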
Example #8
def multpart_encode(form_data):
    _boundary = to_bytes(''.join(
        random.choice(string.digits + string.ascii_letters) for i in range(20)))
    content_type = "multipart/form-data; boundary=" + _boundary

    body = []
    for name, value in form_data.items():
        body.append(b'--' + _boundary)
        body.append(b'Content-Disposition: form-data; name="' + to_bytes(name) + b'"')
        body.append(b'')
        body.append(to_bytes(value))

    body.append(b'--' + _boundary + b'--')

    return content_type, b'\r\n'.join(body)
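A brief usage sketch for the helper above (field names are made up for illustration): it returns a content-type string and the encoded multipart body, which can then be attached to an outgoing POST request.

content_type, body = multpart_encode({'field1': 'value1', 'field2': 'value2'})
# e.g. attach both to a scrapy Request:
#   Request(url, method='POST', body=body,
#           headers={'Content-Type': content_type})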
Example #9
 def _build_response(self, result, request, protocol):
     self.result = result
     respcls = responsetypes.from_args(url=request.url)
     protocol.close()
     body = protocol.filename or protocol.body.read()
     headers = {"local filename": protocol.filename or '', "size": protocol.size}
     return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
Example #10
 def thumb_path(self, request, thumb_id, response=None, info=None):
     url = request.url
     image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
     if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
         return '{ig}.jpg'.format(ig=image_guid)
     else:
         return 'thumbs/{p}/{ig}.jpg'.format(p=thumb_id, ig=image_guid)
Example #11
    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings

            warnings.warn(
                "ImagesPipeline.thumb_key(url) method is deprecated, please use "
                "thumb_path(request, thumb_id, response=None, info=None) instead",
                category=ScrapyDeprecationWarning,
                stacklevel=1,
            )

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, "_base"):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return "thumbs/%s/%s.jpg" % (thumb_id, thumb_guid)
Example #12
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_bytes(url, encoding))
Example #13
 def requestTunnel(self, protocol):
     """Asks the proxy to open a tunnel."""
     tunnelReq = (
         b'CONNECT ' +
         to_bytes(self._tunneledHost, encoding='ascii') + b':' +
         to_bytes(str(self._tunneledPort)) +
         b' HTTP/1.1\r\n')
     if self._proxyAuthHeader:
         tunnelReq += \
             b'Proxy-Authorization: ' + self._proxyAuthHeader + b'\r\n'
     tunnelReq += b'\r\n'
     protocol.transport.write(tunnelReq)
     self._protocolDataReceived = protocol.dataReceived
     protocol.dataReceived = self.processProxyResponse
     self._protocol = protocol
     return protocol
Example #14
 def _test_request(self, **kwargs):
     r = self.request_class('http://scrapytest.org/rpc2', **kwargs)
     self.assertEqual(r.headers[b'Content-Type'], b'text/xml')
     self.assertEqual(r.body, to_bytes(xmlrpclib.dumps(**kwargs)))
     self.assertEqual(r.method, 'POST')
     self.assertEqual(r.encoding, kwargs.get('encoding', 'utf-8'))
     self.assertTrue(r.dont_filter, True)
Example #15
    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        if hasattr(item, '_context'):
            itemdict['@context'] = item._context

        data = self.encoder.encode(itemdict) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Example #16
    def testParse(self):
        lip = '127.0.0.1'
        tests = (
    ("http://127.0.0.1?c=v&c2=v2#fragment",     ('http', lip, lip, 80, '/?c=v&c2=v2')),
    ("http://127.0.0.1/?c=v&c2=v2#fragment",    ('http', lip, lip, 80, '/?c=v&c2=v2')),
    ("http://127.0.0.1/foo?c=v&c2=v2#frag",     ('http', lip, lip, 80, '/foo?c=v&c2=v2')),
    ("http://127.0.0.1:100?c=v&c2=v2#fragment", ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
    ("http://127.0.0.1:100/?c=v&c2=v2#frag",    ('http', lip+':100', lip, 100, '/?c=v&c2=v2')),
    ("http://127.0.0.1:100/foo?c=v&c2=v2#frag", ('http', lip+':100', lip, 100, '/foo?c=v&c2=v2')),

    ("http://127.0.0.1",              ('http', lip, lip, 80, '/')),
    ("http://127.0.0.1/",             ('http', lip, lip, 80, '/')),
    ("http://127.0.0.1/foo",          ('http', lip, lip, 80, '/foo')),
    ("http://127.0.0.1?param=value",  ('http', lip, lip, 80, '/?param=value')),
    ("http://127.0.0.1/?param=value", ('http', lip, lip, 80, '/?param=value')),
    ("http://127.0.0.1:12345/foo",    ('http', lip+':12345', lip, 12345, '/foo')),
    ("http://spam:12345/foo",         ('http', 'spam:12345', 'spam', 12345, '/foo')),
    ("http://spam.test.org/foo",      ('http', 'spam.test.org', 'spam.test.org', 80, '/foo')),

    ("https://127.0.0.1/foo",         ('https', lip, lip, 443, '/foo')),
    ("https://127.0.0.1/?param=value", ('https', lip, lip, 443, '/?param=value')),
    ("https://127.0.0.1:12345/",      ('https', lip+':12345', lip, 12345, '/')),

    ("http://scrapytest.org/foo ",    ('http', 'scrapytest.org', 'scrapytest.org', 80, '/foo')),
    ("http://egg:7890 ",              ('http', 'egg:7890', 'egg', 7890, '/')),
    )

        for url, test in tests:
            test = tuple(
                to_bytes(x) if not isinstance(x, int) else x for x in test)
            self.assertEquals(client._parse(url), test, url)
Example #17
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)
Example #18
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        return 'full/%s%s' % (media_guid, media_ext)
Example #19
 def export_item(self, item):
     if self.first_item:
         self.first_item = False
     else:
         self.file.write(b",\n")
     itemdict = dict(self._get_serialized_fields(item))
     self.file.write(to_bytes(self.encoder.encode(itemdict)))
Example #20
 def to_native_str(text, encoding=None, errors='strict'):
     """ Return str representation of `text`
     (bytes in Python 2.x and unicode in Python 3.x). """
     if six.PY2:
         return to_bytes(text, encoding, errors)
     else:
         return to_unicode(text, encoding, errors)
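A quick illustration of the helper above (values chosen for illustration, importing the same helper from scrapy.utils.python, where it historically lived): on Python 3 it decodes bytes to text using UTF-8 by default and returns text unchanged.

from scrapy.utils.python import to_native_str

to_native_str(b'caf\xc3\xa9')  # -> 'café' on Python 3
to_native_str('café')          # -> 'café' (already text)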
Example #21
    def test_3xx_and_invalid_gzipped_body_must_redirect(self):
        """Regression test for a failure when redirecting a compressed
        request.

        This happens when the httpcompression middleware is executed before the
        redirect middleware and attempts to decompress a non-compressed body.
        In particular, it occurs when a website returns a 30x response with the
        header 'Content-Encoding: gzip', resulting in the error below:

            exceptions.IOError: Not a gzipped file

        """
        req = Request('http://example.com')
        body = b'<p>You are being redirected</p>'
        resp = Response(req.url, status=302, body=body, headers={
            'Content-Length': str(len(body)),
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
            'Location': 'http://example.com/login',
        })
        ret = self._download(request=req, response=resp)
        self.assertTrue(isinstance(ret, Request),
                        "Not redirected: {0!r}".format(ret))
        self.assertEqual(to_bytes(ret.url), resp.headers['Location'],
                         "Not redirected to location header")
Example #22
 def test_get_key(self):
     mw = self.mwcls(self.temp_dir, reset=True)
     test_req1 = Request('http://url1')
     self.assertEqual(mw._get_key(test_req1),
                      to_bytes(request_fingerprint(test_req1)))
     test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
     self.assertEqual(mw._get_key(test_req2), b'dfkey1')
Example #23
 def writeLine(self, data):
     """
     Write the data to the process stdin, adding the new-line delimiter if necessary
     """
     data = to_bytes(data)
     if not data.endswith(b'\n'):
         data += self.__delimiter
     self.transport.write(data)
Example #24
 def render_GET(self, request):
     output = {
         'headers': dict(
             (to_unicode(k), [to_unicode(v) for v in vs])
             for k, vs in request.requestHeaders.getAllRawHeaders()),
         'body': to_unicode(request.content.read()),
     }
     return to_bytes(json.dumps(output))
Example #25
 def export_item(self, item):
     if self.first_item:
         self.first_item = False
     else:
         self.file.write(b',\n')
     itemdict = dict(self._get_serialized_fields(item))
     data = self.encoder.encode(itemdict)
     self.file.write(to_bytes(data, self.encoding))
Example #26
 def browser_open(burl):
     path = urlparse(burl).path
     if not os.path.exists(path):
         path = burl.replace('file://', '')
     with open(path, "rb") as f:
         bbody = f.read()
     self.assertIn(b'<base href="' + to_bytes(url) + b'">', bbody)
     return True
Example #27
 def file_path(self, request, response=None, info=None):
     url = request.url
     image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
     if self.conf["IMAGES_STORE_FORMAT"] == 'FLAT':
         return '{ig}.jpg'.format(ig=image_guid)
     elif self.conf["IMAGES_STORE_FORMAT"] == 'THUMBS':
         return 'thumbs/{p}/{ig}.jpg'.format(p=next(iter(list(settings.get('IMAGES_THUMBS').keys()))), ig=image_guid)
     else:
         return 'full/{ig}.jpg'.format(ig=image_guid)
Example #28
 def _failed(self, result, request):
     message = result.getErrorMessage()
     if result.type == CommandFailed:
         m = _CODE_RE.search(message)
         if m:
             ftpcode = m.group()
             httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
             return Response(url=request.url, status=httpcode, body=to_bytes(message))
     raise result.type(result.value)
Example #29
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers, use the
    include_headers argument, which is a list of Request headers to include.

    """
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                 for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
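A minimal usage sketch of the function above, illustrating the docstring: the two requests below differ only in query-parameter order, so canonicalize_url normalizes them to the same URL and the fingerprints match.

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.example.com/query?id=111&cat=222')
r2 = Request('http://www.example.com/query?cat=222&id=111')
assert request_fingerprint(r1) == request_fingerprint(r2)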
Example #30
 def test_timeoutNotTriggering(self):
     """
     When a non-zero timeout is passed to L{getPage} and the page is
     retrieved before the timeout period elapses, the L{Deferred} is
     called back with the contents of the page.
     """
     d = getPage(self.getURL("host"), timeout=100)
     d.addCallback(
         self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno))
     return d
Example #31
def getPage(url,
            contextFactory=None,
            response_transform=None,
            *args,
            **kwargs):
    """Adapted version of twisted.web.client.getPage"""
    def _clientfactory(url, *args, **kwargs):
        url = to_unicode(url)
        timeout = kwargs.pop('timeout', 0)
        f = client.ScrapyHTTPClientFactory(Request(url, *args, **kwargs),
                                           timeout=timeout)
        f.deferred.addCallback(response_transform or (lambda r: r.body))
        return f

    from twisted.web.client import _makeGetterFactory
    return _makeGetterFactory(to_bytes(url),
                              _clientfactory,
                              contextFactory=contextFactory,
                              *args,
                              **kwargs).deferred
Example #32
    def _get_agent(self, request, timeout):
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if scheme == b'https' and not omitConnectTunnel:
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(reactor, proxyConf,
                    contextFactory=self._contextFactory, connectTimeout=timeout,
                    bindAddress=bindaddress, pool=self._pool)
            else:
                return self._ProxyAgent(reactor, proxyURI=to_bytes(proxy, encoding='ascii'),
                    connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        return self._Agent(reactor, contextFactory=self._contextFactory,
            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
Example #33
    def process_item(self, item, spider):
        session = self.session()
        processing = Processing(s3_raw=hashlib.sha1(to_bytes(
            item['doc_url'])).hexdigest()).save_to_db(session)

        policy = Policy(country=item['country'],
                        geo_code=item['geo_code'],
                        level=item['level'],
                        source=item['source'],
                        title=item['title'],
                        reference=item['reference'],
                        authorship=item['authorship'],
                        resume=item['resume'],
                        publication_date=item['publication_date'],
                        enforcement_date=item['enforcement_date'],
                        url=item['url'],
                        doc_url=item['doc_url'],
                        doc_name=item['doc_name'],
                        doc_type=item['doc_type'],
                        processing_id=processing.id).save_to_db(session)
Example #34
    def process_request(self, request, spider):
        cookies = spider.cookies
        driver = init_chromium(spider.selenium_hostname, cookies)

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        spider.wait_page_completion(driver=driver)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        request.meta['driver'] = driver
        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #35
    def file_path(self, request, response=None, info=None):
        """根据request返回存储文件的相对路径"""
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'FilesPipeline.file_key(url) method is deprecated, please use '
                'file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # SHA-1 hash
        media_ext = os.path.splitext(url)[1]
        return 'files/%s%s' % (media_guid, media_ext)
Example #36
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
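A short usage sketch (the response is constructed inline purely for illustration): open_in_browser writes the body to a temporary .html file, injecting a <base> tag, and opens it in the default browser.

from scrapy.http import HtmlResponse
from scrapy.utils.response import open_in_browser

response = HtmlResponse('http://www.example.com',
                        body=b'<html><head></head><body>hello</body></html>')
open_in_browser(response)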
Example #37
    def file_path(self, request, response=None, info=None, *, item=None):
        # function 2
        img_path = ""
        try:
            # HERE ARE CUSTOM CHANGES
            marketplace = request.meta.get("marketplace")
            asin = request.meta.get("asin")

            img_path = os.path.join(marketplace, asin + ".jpg")
        except:
            media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
            media_ext = os.path.splitext(request.url)[1]
            # Handles empty and wild extensions by trying to guess the
            # mime type then extension or default to empty string otherwise
            if media_ext not in mimetypes.types_map:
                media_ext = ''
                media_type = mimetypes.guess_type(request.url)[0]
                if media_type:
                    media_ext = mimetypes.guess_extension(media_type)
            img_path = f'full/{media_guid}{media_ext}'
        return img_path
Example #38
def run(output_path, download_workers):
    crawl(download_workers)
    reactor.run()  # the script will block here until the last crawl call is finished
    encoder = ScrapyJSONEncoder()
    with open("results/publications.jl", "rb+") as publications_info_file:
        with open(output_path, "wb+") as output_file:
            output_file.write(b"[")
            first = True
            while True:
                line = publications_info_file.readline()
                if not line:
                    break
                if first:
                    output_file.write(b"\n")
                    first = False
                else:
                    output_file.write(b",\n")
                company_info_data = json.loads(line)
                data = encoder.encode(company_info_data)
                output_file.write(to_bytes(data))
            output_file.write(b"\n]\n")
Example #39
    def _get_agent(self, request: Request,
                   timeout: Optional[float]) -> H2Agent:
        from twisted.internet import reactor
        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxy_host = proxy_host.decode()
            omit_connect_tunnel = b'noconnect' in proxy_params
            if omit_connect_tunnel:
                warnings.warn(
                    "Using HTTPS proxies in the noconnect mode is not "
                    "supported by the downloader handler. If you use Zyte "
                    "Smart Proxy Manager, it doesn't require this mode "
                    "anymore, so you should update scrapy-crawlera to "
                    "scrapy-zyte-smartproxy and remove '?noconnect' from the "
                    "Zyte Smart Proxy Manager URL.")

            if scheme == b'https' and not omit_connect_tunnel:
                # ToDo
                raise NotImplementedError(
                    'Tunneling via CONNECT method using HTTP/2.0 is not yet supported'
                )
            return self._ProxyAgent(
                reactor=reactor,
                context_factory=self._context_factory,
                proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
                connect_timeout=timeout,
                bind_address=bind_address,
                pool=self._pool,
            )

        return self._Agent(
            reactor=reactor,
            context_factory=self._context_factory,
            connect_timeout=timeout,
            bind_address=bind_address,
            pool=self._pool,
        )
Example #40
    def page_parse(self, response):
        # pagetitle=response.xpath('/html/head/title').extract_first()
        # print("detail parsing:", pagetitle)

        text1 = response.xpath('//*[@class="cp-js"]/p/text()').extract()
        # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract_first()
        item = WineItem()

        item['url'] = response.url
        item['slogan'] = re.sub(r'—*\s+', '', text1[0])  # slogan
        item['name'] = re.sub(r':', '', text1[1])  # name
        item['odor_type'] = re.sub(r':', '', text1[2])  # aroma type
        item['volume'] = re.sub(r':', '', text1[3])  # alcohol content
        item['net_weight'] = re.sub(r':', '', text1[4])  # net content
        item['general_agency'] = re.sub(r':', '', text1[5])  # general agent
        item['package'] = re.sub(r':', '', text1[6])  # packaging
        item['price'] = ''  # price

        item['sku'] = hashlib.sha1(to_bytes(item['name'])).hexdigest()
        item['images_urls_local'] = 'http://192.168.33.55/pics/' + item[
            'sku'] + '/1.jpg'

        # general agent
        # print(re.sub(r':+', '', text1[5]))

        # text2 = response.xpath('//*[@class="wz-box"]/div/p/text()').extract()
        # print(re.sub(r'\s+', '', text2))

        pictureUrls = response.xpath(
            '//*[@class="items"]/ul/li/img/@bimg').extract()

        img_urls = []
        for picture in pictureUrls:
            pictureUrl = response.urljoin(picture)
            img_urls.append(pictureUrl)

        item['images_urls'] = img_urls

        yield item
Example #41
    def process_request(self, request, spider):
        driver = spider.driver

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(driver, profile_xpath)

        # waiting links to other users are shown so the crawl can continue
        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #42
  def parse_items(self, response):     
      
      items = []
      item = AquariumScapeItem()
 
      src = response.css('#index_photo img::attr(src)').extract_first()
      tank_img = response.urljoin(src)
      item['image_urls'] = [tank_img]
      image_guid = hashlib.sha1(to_bytes(tank_img)).hexdigest()  # change to request.url after deprecation
      item['image_sha1'] = '%s.jpg' % (image_guid)
      item['user_name'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "reeftank_panel_top", " " ))]//span/text()').extract()
      item['rank'] = response.selector.css('.margin_b+ b::text').extract()
      item['average_category_score'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 7) and parent::*)]//b/text()').extract()
      item['average_tank_score'] = response.selector.css('.clear:nth-child(5) b::text').extract()
      item['num_ratings'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "clear", " " )) and (((count(preceding-sibling::*) + 1) = 3) and parent::*)]//b/text()').extract()
      item['fish_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list2", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
      item['plants_kept'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "spe-list1", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]/text()').extract()
      item['comments'] = response.selector.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image_comment", " " ))]/text()').extract()
      # Return all the found items
      items.append(item)
      return items
      
Example #43
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation
        folder = request.meta['folder']
        if len(folder):
            folder = folder[0]
            #pdb.set_trace()
            table = dict((ord(char), None) for char in "|\\?*<\":>+[]/'")
            folder = folder.translate(table)
        else:
            folder = u'未知'
        #pdb.set_trace()
        return '%s/%s.jpg' % (folder, image_guid)
Example #44
def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    if args and kwargs:
        raise ValueError(
            "Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {
        'jsonrpc': '2.0',
        'method': method,
        'params': args or kwargs,
        'id': 1
    }
    data = to_bytes(json.dumps(req))
    body = urllib.request.urlopen(url, data).read()
    res = json.loads(body.decode('utf-8'))
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)
Example #45
    def export_item(self, item):
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',')
            self._beautify_newline()


        __item_properties = dict(self._get_serialized_fields(item))

        __item = {
            'type': 'Feature',
            'properties': __item_properties,
            'geometry': {
                'type': 'Point',
                'coordinates': (
                    float(__item_properties.get('longitude', 0.0)),
                    float(__item_properties.get('latitude', 0.0)))}}

        self.file.write(to_bytes(
            self.encoder.encode(__item),
            self.encoding))
Example #46
    def test_ftp_local_filename(self):
        f, local_fname = tempfile.mkstemp()
        local_fname = to_bytes(local_fname)
        os.close(f)
        meta = {"ftp_local_filename": local_fname}
        meta.update(self.req_meta)
        request = Request(url="ftp://127.0.0.1:%s/file.txt" % self.portNum,
                          meta=meta)
        d = self.download_handler.download_request(request, None)

        def _test(r):
            self.assertEqual(r.body, local_fname)
            self.assertEqual(r.headers, {
                b'Local Filename': [local_fname],
                b'Size': [b'17']
            })
            self.assertTrue(os.path.exists(local_fname))
            with open(local_fname, "rb") as f:
                self.assertEqual(f.read(), b"I have the power!")
            os.remove(local_fname)

        return self._add_test_callbacks(d, _test)
Example #47
    def file_path(self, request, response=None, info=None):
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'full/movie250/%s.jpg' % (image_guid)
Example #48
    def process_item(self, item, spider):
        session = self.session()
        processing = Processing(s3_raw=hashlib.sha1(to_bytes(item['doc_url'])).hexdigest())
        session.add(processing)

        policy = Policy(
            country=item['country'],
            state=item['state'],
            data_source=item['data_source'],
            law_class=item['law_class'],
            title=item['title'],
            reference=item['reference'],
            authorship=item['authorship'],
            resume=item['summary'],
            publication_date=item['publication_date'],
            url=item['url'],
            doc_url=item['doc_url'] + self.API_key,
            doc_name=item['doc_name'],
            processing=processing
        )
        session.merge(policy)
        #print(policy)
        session.commit()
Example #49
 def _downloadImg(imgUrl):
     savePrefix = "/root/SPIDERIMAGESDB/DATASOURCE/Lofter/" + strDate + "/"
     if not os.path.isdir(savePrefix):
         os.makedirs(savePrefix)
     image_guid = hashlib.sha1(to_bytes(imgUrl)).hexdigest()
     image_name = None
     if ".jpg" in imgUrl:
         image_name = image_guid + ".jpg"
     if ".png" in imgUrl:
         image_name = image_guid + ".png"
     if ".jpeg" in imgUrl:
         image_name = image_guid + ".jpeg"
     if image_name is None:
         return
     try:
         with open(savePrefix+image_name, "wb") as writer:
             writer.write(urllib.request.urlopen(imgUrl).read())
     except Exception as es:
         print(es)
     with codecs.open("./download" + strDate + ".csv", mode='a', encoding='utf8') as writer:
         writer.write(imgUrl+',Lofter/'+strDate+'/'+image_name)
         writer.write('\n')
         print(imgUrl, image_name)
Example #50
    def process_request(self, request, spider):
        self.driver.get(request.url)
        if spider.name == 'wanted' and request.meta != {}:
            time.sleep(2)
            some_tag = self.driver.find_element_by_class_name(
                '_3h_f6TfissC0l7ogPcn7lY')
            action = ActionChains(self.driver)
            action.move_to_element(some_tag).perform()
            time.sleep(1)
        else:
            time.sleep(1.2)
        if spider.name == 'wanted' and request.meta == {}:
            self.infinite_scroll_control()
        elif spider.name == 'roketpunch' and 'job_card_company' in request.meta:
            self.see_more_button_click()
        elif spider.name == 'naver' and request.meta == {}:
            self.naver_more_button()

        body = to_bytes(text=self.driver.page_source)
        return HtmlResponse(url=request.url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #51
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation

        # Override this method: if the item contains save_sub_dir, store the image in that subdirectory
        item = request.meta['item']
        if 'save_sub_dir' in item:
            path = '%s/full/%s.jpg' % (item['save_sub_dir'], image_guid)
        else:
            path = 'full/%s.jpg' % image_guid
        return path
Example #52
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        if spider.browser:
            request.meta[
                'browser'] = self.browser  # to access driver from response
            self.browser.get(request.url)
            # wait js eval
            time.sleep(15)
            body = to_bytes(
                self.browser.page_source)  # body must be of type bytes
            return HtmlResponse(self.browser.current_url,
                                body=body,
                                encoding='utf-8',
                                request=request)
        else:
            return None
Example #53
    def parse_page(self, response):
        item = MeituItem()
        item['referer'] = response.meta['item']['referer']
        href = response.xpath("//div[@class='content-pic']/a/img/@src").extract_first()
        image_name = response.xpath("//div[@class='content']/h5/text()").extract_first()
        item['image_name'] = image_name.split('(')[0]
        item['image_hash'] = hashlib.sha1(to_bytes(item['image_name'])).hexdigest()
        try:
            item['image_num'] = image_name.split('(')[1].split(')')[0].zfill(2)
        except IndexError:
            item['image_num'] = '01'
        item['image_urls'] = [href]
        item['spider_name'] = self.name
        yield item

        # next page
        next_href = response.xpath("//a[text()='下一页']/@href").extract_first()
        if next_href:
            next_href = urljoin(response.url, next_href)
            yield scrapy.Request(
                url=next_href,
                callback=self.parse_page,
                meta={'item': {'referer': response.url}}
            )
Example #54
    def process_request(self, request, spider):
        driver = spider.driver

        logger.info('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        logger.info('SeleniumMiddleware - click more options')
        more_option = get_by_xpath(
            driver, '//div/div/button[@class="more-options dropdown-caret"]')
        more_option.send_keys(Keys.NULL)
        more_option.click()

        logger.info('SeleniumMiddleware - wait for names')
        name = get_by_xpath(driver, '//ul[@class="browse-map-list"]/li/h4/a')
        name.send_keys(Keys.NULL)

        #request.meta['driver'] = self.driver  # to access driver from response

        logging.info('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
Example #55
 def store_response(self, spider, request, response):
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
     }
     with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
         f.write(to_bytes(repr(metadata)))
     with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
         pickle.dump(metadata, f, protocol=2)
     with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(response.headers))
     with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
         f.write(response.body)
     with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(request.headers))
     with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
         f.write(request.body)
Example #56
 def store_response(self, spider, request, response):
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         "url": request.url,
         "method": request.method,
         "status": response.status,
         "response_url": response.url,
         "timestamp": time(),
     }
     with self._open(os.path.join(rpath, "meta"), "wb") as f:
         f.write(to_bytes(repr(metadata)))
     with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
         pickle.dump(metadata, f, protocol=4)
     with self._open(os.path.join(rpath, "response_headers"), "wb") as f:
         f.write(headers_dict_to_raw(response.headers))
     with self._open(os.path.join(rpath, "response_body"), "wb") as f:
         f.write(response.body)
     with self._open(os.path.join(rpath, "request_headers"), "wb") as f:
         f.write(headers_dict_to_raw(request.headers))
     with self._open(os.path.join(rpath, "request_body"), "wb") as f:
         f.write(request.body)
Example #57
    def process_item(self, item, spider):
        '''
        Download images.
        :param item:
        :param spider:
        :return:
        '''
        for image in item['image_urls'].split(','):
            # image file name
            image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
            image_name = '%s.jpg' % (image_guid)

            house_id = item['from_url'].split('/')[-1].replace('.html', '')
            file_path = '%s/%s' % (self.imagepath, house_id)

            if not os.path.exists(file_path):
                os.makedirs(file_path)

            image_path = '%s/%s/%s' % (self.imagepath, house_id, image_name)

            if not os.path.exists(image_path):
                urlretrieve(image, image_path)
            else:
                raise DropItem('It exists!')
Example #58
    def file_path(self, request, response=None, info=None):
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn(
                'ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                'please use file_path(request, response=None, info=None) instead',
                category=ScrapyDeprecationWarning,
                stacklevel=1)

        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url
        image_guid = hashlib.sha1(to_bytes(
            url)).hexdigest()  # change to request.url after deprecation
        word = str(request.meta['word']).split('word=')[-1]
        try:
            word = urllib.unquote(word).decode('utf-8')
        except Exception as ex:
            word = urllib.parse.unquote(word)
        print(word + "/%s.jpg" % image_guid)
        return word + '/%s.jpg' % (image_guid)
Example #59
    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
Example #60
    def file_path(self, request, response=None, info=None):
        start_time = self._get_start_time(info.spider)
        start_time_str = start_time.strftime('%Y%m%d_%H%M%S')
        content_type = ''
        if response:
            # This is to cover the case when the url has a "." after the last "/"
            # and the text after the "." is not a file extension, but the response is JSON
            content_type = to_native_str(response.headers['Content-Type'])
        url = request.url
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]

        if hasattr(info.spider, 'ext'):
            media_ext = info.spider.ext
        elif not media_ext or ('json' in content_type
                               and media_ext != '.json'):
            media_ext = '.json'
        # Put files in a directory named after the scraper they came from, and the scraper start time
        if hasattr(info.spider, 'sample') and info.spider.sample == 'true':
            return '%s_sample/%s/%s%s' % (info.spider.name, start_time_str,
                                          media_guid, media_ext)
        else:
            return '%s/%s/%s%s' % (info.spider.name, start_time_str,
                                   media_guid, media_ext)