Beispiel #1
0
    def test_traits(self):
        """Parsed URL parts are attributes, and assigning ``path`` shows up in ``str()``."""
        original = 'http://example.com/foo/bar.csv'
        url = Url(original)

        # Components parsed out of the URL string.
        self.assertEqual('http', url.proto)
        self.assertEqual('/foo/bar.csv', url.path)

        # Unmodified, the URL stringifies back to its input.
        self.assertEqual(original, str(url))

        # Assigning a new path is reflected in the string form.
        url.path = '/bar/baz.csv'
        self.assertEqual('http://example.com/bar/baz.csv', str(url))
Beispiel #2
0
    def _download(self, url, cache_path):
        """Fetch ``url`` and store its content at ``cache_path`` in ``self.cache``.

        The transport is chosen from the URL prefix:

        * ``s3:`` -- an ``appurl`` ``Url`` is built and its
          ``object.download_fileobj`` writes into the cache file.
        * ``ftp:`` -- the URL is opened with ``urlopen`` and copied in
          16 KiB chunks.
        * anything else -- a streaming ``requests.get``; the raw response
          is handed to ``copy_file_or_flo``.

        When ``self.callback`` is set it is called with
        ``('download', url, 0)`` before the transfer. Copy progress is
        reported as ``('copy_file', read, total)``: directly in the FTP
        loop, and via the ``cb`` argument of ``copy_file_or_flo`` in the
        HTTP branch.

        :param url: URL string to fetch.
        :param cache_path: Destination path inside ``self.cache``.
        :raises DownloadError: if the S3 fetch fails for any reason, or if
            the GET request fails with an ``SSLError``. Other errors from
            ``requests`` (including HTTP error statuses from
            ``raise_for_status``) propagate unchanged.
        """
        import requests
        from contextlib import closing

        def copy_callback(read, total):
            # Forward copy progress to the user callback; no-op when unset.
            if self.callback:
                self.callback('copy_file', read, total)

        if self.callback:
            self.callback('download', url, 0)

        if url.startswith('s3:'):

            from appurl.url import Url

            s3url = Url(url)

            try:
                with self.cache.open(cache_path, 'wb') as f:
                    s3url.object.download_fileobj(f)
            except Exception as e:
                raise DownloadError("Failed to fetch S3 url '{}': {}".format(
                    url, e))

        elif url.startswith('ftp:'):

            with closing(urlopen(url)) as fin:

                with self.cache.open(cache_path, 'wb') as fout:

                    read_len = 16 * 1024
                    total_len = 0
                    while True:
                        buf = fin.read(read_len)
                        if not buf:
                            break
                        fout.write(buf)
                        total_len += len(buf)

                        # copy_callback already checks self.callback itself.
                        copy_callback(len(buf), total_len)

        else:

            try:
                r = requests.get(url, stream=True)
            except SSLError as e:
                raise DownloadError("Failed to GET {}: {} ".format(url, e))

            # A streamed response holds its connection until it is fully
            # consumed or closed. closing() releases it even when the status
            # check or the copy below raises.
            with closing(r):
                # raise_for_status() only raises HTTPError, so it does not
                # need to sit inside the SSLError handler above.
                r.raise_for_status()

                # Requests will auto decode gzip responses, but not when
                # streaming. The following monkey patch is recommended by a
                # core developer at
                # https://github.com/kennethreitz/requests/issues/2155
                if r.headers.get('content-encoding') == 'gzip':
                    r.raw.read = functools.partial(r.raw.read, decode_content=True)

                with self.cache.open(cache_path, 'wb') as f:
                    copy_file_or_flo(r.raw, f, cb=copy_callback)

            assert self.cache.exists(cache_path)
Beispiel #3
0
    def join_dir(self, s):
        """Join ``s`` to this URL, using the fragment for ``zip``/``xlsx`` resources.

        When ``resource_format`` is ``zip`` or ``xlsx``, ``s`` is parsed as a
        ``Url`` and its path becomes the fragment of a clone of this URL.
        Every other format defers to the parent class's ``join_dir``.
        """
        if self.resource_format not in ('zip', 'xlsx'):
            return super().join_dir(s)

        inner = Url(s)
        return self.clone(fragment=inner.path)
Beispiel #4
0
    def test_fragment(self):
        """The URL fragment drives ``target_file`` and ``target_segment``."""

        # No fragment: both fragment parts are None and the target file is
        # the file name from the path.
        bare = Url('http://example.com/file.csv')
        self.assertEqual((None, None), tuple(bare.fragment))
        self.assertEqual('file.csv', bare.target_file)
        self.assertEqual(None, bare.target_segment)
        self.assertEqual('http://example.com/file.csv', str(bare))

        # A single-part fragment names the target file only.
        file_only = Url('http://example.com/file.csv#a')
        self.assertEqual(('a', None), tuple(file_only.fragment))
        self.assertEqual('a', file_only.target_file)
        self.assertEqual(None, file_only.target_segment)
        self.assertEqual('http://example.com/file.csv#a', str(file_only))

        # 'file;segment' sets both the target file and the target segment.
        file_and_segment = Url('http://example.com/file.csv#a;b')
        self.assertEqual('a', file_and_segment.target_file)
        self.assertEqual('b', file_and_segment.target_segment)
        self.assertEqual('http://example.com/file.csv#a;b', str(file_and_segment))
Beispiel #5
0
    def test_query_urls(self):
        """A query string must not affect the resource/target file or format."""
        signed_url = 'https://s3.amazonaws.com/private.library.civicknowledge.com/civicknowledge.com-rcfe_health-1/metadata.csv?AWSAccessKeyId=AKIAJFW23EPQCLXRU7DA&Signature=A39XhRP%2FTKAxv%2B%2F5vCubwWPDag0%3D&Expires=1494223447'

        parsed = Url(signed_url)

        # Resource side: the file named by the URL path.
        self.assertEqual('metadata.csv', str(parsed.resource_file))
        self.assertEqual('csv', str(parsed.resource_format))

        # Target side: with no fragment, the same file and format.
        self.assertEqual('metadata.csv', str(parsed.target_file))
        self.assertEqual('csv', str(parsed.target_format))
Beispiel #6
0
    def download(self, url):
        """Return a ``Resource`` describing the local copy of ``url``.

        ``file`` URLs are referenced in place rather than copied: a few
        candidate locations derived from the URL path are probed in a fixed
        order, and the first one that exists becomes ``sys_path``. Any other
        scheme is fetched through ``_download_with_lock``; if that raises
        ``AccessError`` the fetch is retried once with
        ``url.auth_resource_url``.

        :param url: URL object; the code reads ``scheme``, ``resource_url``,
            ``path`` and, for the retry, ``auth_resource_url``.
        :return: A ``Resource`` with ``cache_path``, ``download_time`` and
            ``sys_path`` set.
        :raises DownloadError: if a ``file`` URL matches no candidate location.
        :raises AccessError: if the first fetch is denied and the retry
            raises ``AttributeError`` (the original error is re-raised).
        """

        working_dir = self.working_dir if self.working_dir else ''

        r = Resource()

        # For local files, don't download, just reference in place.
        if url.scheme == 'file':
            r.cache_path = Url(url.resource_url).path
            r.download_time = None

            # Many places the file may exist. Kept as an ordered list (not a
            # set) so the lookup precedence and the error message are
            # deterministic from run to run.
            # NOTE(review): the original listed the lstrip('/') candidate
            # twice; possibly join(working_dir, <stripped path>) was intended
            # as a fourth candidate -- confirm.
            candidates = [
                abspath(r.cache_path),
                abspath(r.cache_path.lstrip('/')),
                abspath(join(working_dir, r.cache_path)),
            ]

            # Drop duplicates while preserving the order above.
            locations = []
            for candidate in candidates:
                if candidate not in locations:
                    locations.append(candidate)

            for location in locations:
                if exists(location):
                    r.sys_path = location
                    break
            else:
                raise DownloadError(
                    ("File resource does not exist. Found none of:"
                     "\n{}\n\nWorking dir = {}\ncache_path={}\nspec_path={}"
                     ).format('\n'.join(locations), working_dir, r.cache_path,
                              url.path))

        else:
            # Not a local file, so actually need to download it.
            try:
                r.cache_path, r.download_time = self._download_with_lock(
                    url.resource_url)
            except AccessError as e:
                # Try again, using a URL that we may have configured an account for. This is
                # primarily S3 urls, with Boto or AWS credential
                try:
                    r.cache_path, r.download_time = self._download_with_lock(
                        url.auth_resource_url)
                except AttributeError:
                    # The retry could not be carried out; surface the
                    # original access failure instead.
                    raise e

            r.sys_path = self.cache.getsyspath(r.cache_path)

        return r
Beispiel #7
0
    def test_base_url(self):
        """Round-trip URLs through ``Url`` and check the file/resource accessors."""

        round_trip_cases = (
            'http://server.com/a/b/c/file.csv',
            'http://server.com/a/b/c/file.csv#a',
            'http://server.com/a/b/c/file.csv#a;b',
            'http://server.com/a/b/c/archive.zip#file.csv',
        )
        for text in round_trip_cases:
            self.assertEqual(text, str(Url(text)))

        # Plain file URL: target and resource are the same file.
        plain = 'http://server.com/a/b/c/file.csv'
        self.assertEqual('file.csv', Url(plain).target_file)
        self.assertEqual('file.csv', Url(plain).resource_file)
        self.assertEqual(plain, Url(plain).resource_url)

        # Archive URL: the fragment names the target, the path names the resource.
        archived = 'http://server.com/a/b/c/resource.zip#file.csv'
        self.assertEqual('file.csv', Url(archived).target_file)
        self.assertEqual('resource.zip', Url(archived).resource_file)