Example 1
def test_limit_post():
    """POST a length-capped stream; the service should only ever see 3 bytes."""
    body = LimitReader(BytesIO(b'abcdefg'), 3)
    resp = requests.request(
        method='POST',
        url='http://httpbin.org/post',
        data=body,
        headers={'Content-Length': '3'},
    )
    # httpbin echoes the posted payload back in its JSON response
    assert '"abc"' in resp.text
Example 2
    def fetch_http(self,
                   url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   ignore_proxies=False,
                   verify=True):
        """Fetch *url* live over HTTP, optionally replaying a WSGI request.

        When *env* (a WSGI environ) is given, its method, input stream and
        translated headers are forwarded.  Returns a tuple of
        (StatusAndHeaders, raw response stream).
        """
        method = 'GET'
        data = None

        # honor the configured proxies unless explicitly bypassed
        proxies = None if ignore_proxies else self.proxies

        req_headers = req_headers or {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                # cap the body at Content-Length so we never over-read the input
                content_len = env.get('CONTENT_LENGTH')
                data = LimitReader(input_, int(content_len)) if content_len else input_

        response = requests.request(method=method,
                                    url=url,
                                    data=data,
                                    headers=req_headers,
                                    allow_redirects=follow_redirects,
                                    proxies=proxies,
                                    stream=True,
                                    verify=verify)

        statusline = '{0} {1}'.format(response.status_code, response.reason)
        status_headers = StatusAndHeaders(statusline, response.headers.items())

        return (status_headers, response.raw)
Example 3
    def do_upload(self, filename, stream, user, coll, rec, offset, length):
        """PUT *length* bytes of *stream*, starting at *offset*, to the record host."""
        stream.seek(offset)

        logger.debug('do_upload(): {0} offset: {1}: len: {2}'.format(rec, offset, length))

        # cap the readable bytes so requests stops after exactly *length*
        body = LimitReader(stream, length)

        target = self.upload_path.format(record_host=self.record_host,
                                         user=user,
                                         coll=coll,
                                         rec=rec)

        requests.put(target,
                     headers={'Content-Length': str(length)},
                     data=body)
Example 4
    def load(self, url, offset=0, length=-1):
        """Load *url*, serving its bytes from the Django cache when possible.

        Returns a file-like object positioned at *offset*; when *length* >= 0
        the returned reader is capped at *length* bytes.
        """
        # first try to fetch url contents from cache
        # raw string so the regex escape \w is not mistaken for a string escape
        cache_key = 'warc-' + re.sub(r'[^\w-]', '', url)
        file_contents = django_cache.get(cache_key)
        if not file_contents:
            # url wasn't in cache -- fetch entire contents of url from super() and put in cache
            file_contents = super(CachedLoader, self).load(url).read()
            # use a short timeout so large warcs don't evict everything else in the cache
            django_cache.set(cache_key, file_contents, timeout=60)

        # turn string contents of url into file-like object
        afile = StringIO.StringIO(file_contents)

        # --- from here down is taken from super() ---
        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile
Example 5
    def load(self, url, offset=0, length=-1):
        """Load *url* from cache, a LOCKSS mirror, or local storage (in that order).

        Returns a file-like object positioned at *offset*; when *length* >= 0
        the returned reader is capped at *length* bytes.  Records which mirror
        served the content on ``thread_local_data.wbrequest.mirror_name``.
        """
        # first try to fetch url contents from cache
        # raw string so the regex escape \w is not mistaken for a string escape
        cache_key = 'warc-' + re.sub(r'[^\w-]', '', url)
        mirror_name_cache_key = cache_key + '-mirror-name'
        mirror_name = ''

        file_contents = django_cache.get(cache_key)
        if file_contents is None:
            # url wasn't in cache -- load contents

            # try fetching from each mirror in the LOCKSS network, in random order
            if settings.USE_LOCKSS_REPLAY:
                mirrors = Mirror.get_cached_mirrors()
                random.shuffle(mirrors)

                for mirror in mirrors:
                    lockss_key = url.replace('file://', '').replace(
                        WARC_STORAGE_PATH,
                        'https://' + settings.HOST + '/lockss/fetch')
                    lockss_url = urljoin(mirror['content_url'], 'ServeContent')
                    try:
                        logging.info("Fetching from %s?url=%s" %
                                     (lockss_url, lockss_key))
                        response = requests.get(lockss_url,
                                                params={'url': lockss_key})
                        if not response.ok:
                            # explicit raise: a bare assert would be stripped
                            # under `python -O`, silently accepting error pages
                            raise AssertionError(response.status_code)
                        file_contents = response.content
                        mirror_name = mirror['name']
                        logging.info("Got content from lockss")
                        # stop at the first responsive mirror; without this we
                        # would query every mirror and keep the last response
                        break
                    except (requests.ConnectionError, requests.Timeout,
                            AssertionError) as e:
                        logging.info("Couldn't get from lockss: %s" % e)

            # If url wasn't in LOCKSS yet or LOCKSS is disabled, fetch from local storage using super()
            if file_contents is None:
                file_contents = super(CachedLoader, self).load(url).read()
                logging.info("Got content from local disk")

            # cache file contents
            # use a short timeout so large warcs don't evict everything else in the cache
            django_cache.set(cache_key, file_contents, timeout=60)
            django_cache.set(mirror_name_cache_key, mirror_name, timeout=60)

        else:
            mirror_name = django_cache.get(mirror_name_cache_key)
            #logging.info("Got content from cache")

        # set wbrequest.mirror_name so it can be displayed in template later
        thread_local_data.wbrequest.mirror_name = mirror_name

        # turn string contents of url into file-like object
        afile = StringIO.StringIO(file_contents)

        # --- from here down is taken from super() ---
        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile
Example 6
    def fetch_http(self,
                   url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   skip_recording=False,
                   verify=True):
        """Fetch *url* live via ``self.live_request``, optionally replaying a WSGI request.

        When *env* (a WSGI environ) is given, its method, input stream and
        translated headers are forwarded.  Returns a tuple of
        (StatusAndHeaders, raw response stream) with the headers taken from
        the un-normalized underlying response.
        """
        method = 'GET'
        data = None

        # recording goes through the configured proxies; skip them otherwise
        proxies = None
        if not skip_recording:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                # cap the body at Content-Length so we never over-read the input
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = self.live_request(method=method,
                                     url=url,
                                     data=data,
                                     headers=req_headers,
                                     allow_redirects=follow_redirects,
                                     proxies=proxies,
                                     stream=True,
                                     verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        stream = response.raw

        # Pull the raw header list off the underlying urllib3/httplib response
        # (requests' response.headers folds duplicates, which we want to avoid).
        try:  #pragma: no cover
            #PY 3
            headers = stream._original_response.headers._headers
        except AttributeError:  #pragma: no cover
            #PY 2: msg.headers is a list of raw "Name: value" strings
            headers = []
            resp_headers = stream._original_response.msg.headers
            for h in resp_headers:
                n, v = h.split(':', 1)
                headers.append((n.strip(), v.strip()))

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)