def test_traits(self):
    """Check the parsed attributes of a Url and that reassigning ``path`` shows up in ``str()``."""
    original = 'http://example.com/foo/bar.csv'
    u = Url(original)

    # Attributes as parsed from the original string.
    for expected, actual in (('http', u.proto), ('/foo/bar.csv', u.path)):
        self.assertEqual(expected, actual)

    # The string form round-trips before any modification.
    self.assertEqual(original, str(u))

    # Assigning a new path must be reflected in the string form.
    u.path = '/bar/baz.csv'
    self.assertEqual('http://example.com/bar/baz.csv', str(u))
def _download(self, url, cache_path):
    """Fetch ``url`` and write its content to ``cache_path`` inside ``self.cache``.

    The transport is chosen from the URL prefix:

    * ``s3:``  -- via the ``object`` of an ``appurl`` ``Url`` (``download_fileobj``).
    * ``ftp:`` -- via ``urlopen``, copied in 16 KiB chunks.
    * anything else -- via a streamed ``requests.get``.

    If ``self.callback`` is set, it is called once with ``('download', url, 0)``
    before the transfer starts. During the FTP copy it is called with
    ``('copy_file', chunk_length, running_total)``; for the ``requests`` branch
    the same reporter is handed to ``copy_file_or_flo`` as its ``cb`` argument.

    :param url: URL string of the resource to fetch.
    :param cache_path: Destination path within ``self.cache``.
    :raises DownloadError: If the S3 fetch fails for any reason, or if the
        ``requests`` call fails with an ``SSLError``. Other errors from
        ``requests`` (for instance those raised by ``raise_for_status()``)
        propagate unchanged.
    """
    import requests

    def copy_callback(read, total):
        # Progress reporter; a no-op when no callback was configured.
        if self.callback:
            self.callback('copy_file', read, total)

    if self.callback:
        self.callback('download', url, 0)

    if url.startswith('s3:'):
        from appurl.url import Url

        s3url = Url(url)

        try:
            with self.cache.open(cache_path, 'wb') as f:
                s3url.object.download_fileobj(f)
        except Exception as e:
            # Chain explicitly so the underlying failure stays attached as the cause.
            raise DownloadError("Failed to fetch S3 url '{}': {}".format(
                url, e)) from e

    elif url.startswith('ftp:'):
        from contextlib import closing

        read_len = 16 * 1024
        total_len = 0

        with closing(urlopen(url)) as fin, self.cache.open(cache_path, 'wb') as fout:
            while True:
                buf = fin.read(read_len)
                if not buf:
                    break
                fout.write(buf)
                total_len += len(buf)
                # copy_callback already checks self.callback itself.
                copy_callback(len(buf), total_len)

    else:
        r = None
        try:
            try:
                r = requests.get(url, stream=True)
                r.raise_for_status()
            except SSLError as e:
                raise DownloadError("Failed to GET {}: {} ".format(url, e)) from e

            # Requests will auto decode gzip responses, but not when streaming. The following
            # monkey patch is recommended by a core developer at
            # https://github.com/kennethreitz/requests/issues/2155
            if r.headers.get('content-encoding') == 'gzip':
                r.raw.read = functools.partial(r.raw.read, decode_content=True)

            with self.cache.open(cache_path, 'wb') as f:
                copy_file_or_flo(r.raw, f, cb=copy_callback)
        finally:
            # A streamed response holds its connection until it is fully consumed or
            # closed; close it explicitly so a failed status check or an interrupted
            # copy does not leak the connection.
            if r is not None:
                r.close()

    # Post-condition: whichever branch ran, the file must now be in the cache.
    assert self.cache.exists(cache_path)
def join_dir(self, s):
    """Join ``s`` onto this URL.

    When ``resource_format`` is ``'zip'`` or ``'xlsx'``, return a clone of this
    URL whose fragment is the path of ``s`` parsed as a ``Url``. For every
    other format, defer to the parent class's ``join_dir``.
    """
    if self.resource_format not in ('zip', 'xlsx'):
        return super().join_dir(s)

    inner_path = Url(s).path
    return self.clone(fragment=inner_path)
def test_fragment(self):
    """Check fragment parsing, target file/segment and string round-tripping."""
    # (url string, expected fragment tuple or None to skip that check,
    #  expected target_file, expected target_segment)
    cases = (
        ('http://example.com/file.csv', (None, None), 'file.csv', None),
        ('http://example.com/file.csv#a', ('a', None), 'a', None),
        ('http://example.com/file.csv#a;b', None, 'a', 'b'),
    )

    for url_s, fragment, target_file, target_segment in cases:
        u = Url(url_s)

        if fragment is not None:
            self.assertEqual(fragment, tuple(u.fragment))

        self.assertEqual(target_file, u.target_file)
        self.assertEqual(target_segment, u.target_segment)
        self.assertEqual(url_s, str(u))
def test_query_urls(self):
    """A query string must not leak into the resource/target file name or format."""
    # Same URL as a single literal; split only for readability.
    url = (
        'https://s3.amazonaws.com/private.library.civicknowledge.com/'
        'civicknowledge.com-rcfe_health-1/metadata.csv'
        '?AWSAccessKeyId=AKIAJFW23EPQCLXRU7DA'
        '&Signature=A39XhRP%2FTKAxv%2B%2F5vCubwWPDag0%3D'
        '&Expires=1494223447'
    )

    u = Url(url)

    expectations = (
        ('metadata.csv', 'resource_file'),
        ('csv', 'resource_format'),
        ('metadata.csv', 'target_file'),
        ('csv', 'target_format'),
    )

    for expected, attr_name in expectations:
        self.assertEqual(expected, str(getattr(u, attr_name)))
def download(self, url):
    """Resolve ``url`` to a ``Resource`` describing where its content lives on disk.

    For ``file`` scheme URLs nothing is downloaded: the path taken from
    ``url.resource_url`` is probed at several candidate locations and the
    first one that exists becomes ``sys_path``; ``download_time`` is ``None``.

    For every other scheme the resource is fetched with
    ``_download_with_lock(url.resource_url)``. If that raises ``AccessError``
    the fetch is retried once with ``url.auth_resource_url``; if the retry
    raises ``AttributeError`` (presumably because the URL has no
    ``auth_resource_url`` -- confirm against the URL classes), the original
    ``AccessError`` is re-raised.

    :param url: URL object exposing ``scheme``, ``resource_url`` and ``path``
        (and optionally ``auth_resource_url``).
    :return: A ``Resource`` with ``cache_path``, ``download_time`` and
        ``sys_path`` set.
    :raises DownloadError: If a local file is found at none of the candidate
        locations.
    :raises AccessError: If the download is denied and no authenticated retry
        is possible.
    """
    working_dir = self.working_dir if self.working_dir else ''

    r = Resource()

    # For local files, don't download, just reference in place.
    if url.scheme == 'file':
        r.cache_path = Url(url.resource_url).path
        r.download_time = None

        # Many places the file may exist. Kept as an ordered, de-duplicated
        # list rather than a set, so both the probing precedence and the
        # error message are the same on every run.
        candidates = (
            abspath(r.cache_path),
            abspath(r.cache_path.lstrip('/')),
            abspath(join(working_dir, r.cache_path)),
        )

        locations = []
        for candidate in candidates:
            if candidate not in locations:
                locations.append(candidate)

        for location in locations:
            if exists(location):
                r.sys_path = location
                break
        else:
            raise DownloadError(
                ("File resource does not exist. Found none of:"
                 "\n{}\n\nWorking dir = {}\ncache_path={}\nspec_path={}"
                 ).format('\n'.join(locations), working_dir, r.cache_path,
                          url.path))

    else:
        # Not a local file, so actually need to download it.
        try:
            r.cache_path, r.download_time = self._download_with_lock(
                url.resource_url)
        except AccessError as e:
            # Try again, using a URL that we may have configured an account for. This is
            # primarily S3 urls, with Boto or AWS credential
            try:
                r.cache_path, r.download_time = self._download_with_lock(
                    url.auth_resource_url)
            except AttributeError:
                # No authenticated alternative; surface the original access failure.
                raise e

        r.sys_path = self.cache.getsyspath(r.cache_path)

    return r
def test_base_url(self):
    """Simple test of splitting and recombining"""
    base = 'http://server.com/a/b/c/'
    csv_url = base + 'file.csv'
    zip_url = base + 'resource.zip#file.csv'

    # Every one of these must survive a parse / str() round trip unchanged.
    round_trip_tails = ('file.csv', 'file.csv#a', 'file.csv#a;b', 'archive.zip#file.csv')
    for u_s in [base + tail for tail in round_trip_tails]:
        self.assertEqual(u_s, str(Url(u_s)))

    # Plain file URL: target and resource are the same file.
    self.assertEqual('file.csv', Url(csv_url).target_file)
    self.assertEqual('file.csv', Url(csv_url).resource_file)
    self.assertEqual(csv_url, Url(csv_url).resource_url)

    # Archive URL: the fragment names the target, the path names the resource.
    self.assertEqual('file.csv', Url(zip_url).target_file)
    self.assertEqual('resource.zip', Url(zip_url).resource_file)