def _download(url, cache_fs, cache_path, account_accessor, logger, callback):
    """Download *url* into *cache_fs* at *cache_path*.

    Dispatches on the URL scheme: ``s3:`` URLs go through :func:`get_s3`,
    ``ftp:`` URLs through ``urlopen``, and everything else through
    ``requests`` with streaming enabled.

    Args:
        url (str): source URL (s3://, ftp://, http(s)://, ...).
        cache_fs: pyfilesystem-like object the file is written into.
        cache_path (str): destination path within *cache_fs*.
        account_accessor (callable): credential lookup, passed to get_s3()
            for s3 URLs; unused for other schemes.
        logger: unused here; kept for interface compatibility with callers.
        callback (callable or None): progress callback invoked as
            ``callback(bytes_this_chunk, total_bytes)`` while copying.

    Raises:
        ResourceNotFoundError: if an s3 path does not exist in the bucket.
        requests.HTTPError: for non-2xx HTTP responses (raise_for_status).
    """
    import functools
    import requests
    # six.moves gives the py2/py3-correct locations for unquote_plus and
    # urlopen (the original relied on py2-only urllib.unquote_plus and on a
    # module-level urlopen import).
    from six.moves.urllib.parse import unquote_plus
    from six.moves.urllib.request import urlopen
    from fs.errors import ResourceNotFoundError

    if url.startswith('s3:'):
        s3 = get_s3(url, account_accessor)
        pd = parse_url_to_dict(url)
        try:
            with cache_fs.open(cache_path, 'wb') as fout:
                with s3.open(unquote_plus(pd['path']), 'rb') as fin:
                    copy_file_or_flo(fin, fout, cb=callback)
        except ResourceNotFoundError:
            raise ResourceNotFoundError(
                "Failed to find path '{}' in S3 FS '{}' ".format(pd['path'], s3))

    elif url.startswith('ftp:'):
        from contextlib import closing

        with closing(urlopen(url)) as fin:
            with cache_fs.open(cache_path, 'wb') as fout:
                read_len = 16 * 1024  # copy in 16 KiB chunks
                total_len = 0
                while True:
                    buf = fin.read(read_len)
                    if not buf:
                        break
                    fout.write(buf)
                    total_len += len(buf)
                    if callback:
                        callback(len(buf), total_len)

    else:
        r = requests.get(url, stream=True)
        r.raise_for_status()

        # Requests will auto decode gzip responses, but not when streaming.
        # This monkey patch is recommended by a core developer at
        # https://github.com/kennethreitz/requests/issues/2155
        if r.headers.get('content-encoding') == 'gzip':
            r.raw.read = functools.partial(r.raw.read, decode_content=True)

        with cache_fs.open(cache_path, 'wb') as f:
            copy_file_or_flo(r.raw, f, cb=callback)

    assert cache_fs.exists(cache_path)
def get_s3(url, account_accessor):
    """Gets file from s3 storage.

    Args:
        url (str): url of the file
        account_accessor (callable): callable returning dictionary with s3 credentials
            (access and secret at least)

    Example:
        get_s3('s3://example.com/file1.csv', lambda url: {'access': '<access>', 'secret': '<secret>'})

    Returns:
        S3FS instance (file-like):
    """
    # TODO: Hack the pyfilesystem fs.opener file to get credentials from a keychain
    # The monkey patch fixes a bug: https://github.com/boto/boto/issues/2836
    _old_match_hostname = ssl.match_hostname

    # FIXME. This issue is possibly better handled with
    # https://pypi.python.org/pypi/backports.ssl_match_hostname
    def _new_match_hostname(cert, hostname):
        # Strip dots out of the bucket portion of the hostname so that
        # wildcard certs for *.s3.amazonaws.com validate against dotted
        # bucket names.
        if hostname.endswith('.s3.amazonaws.com'):
            pos = hostname.find('.s3.amazonaws.com')
            hostname = hostname[:pos].replace('.', '') + hostname[pos:]
        return _old_match_hostname(cert, hostname)

    ssl.match_hostname = _new_match_hostname

    pd = parse_url_to_dict(url)

    if account_accessor is None or not six.callable(account_accessor):
        raise TypeError('account_accessor argument must be callable of one argument returning dict.')

    account = account_accessor(pd['netloc'])
    # Direct access to the accounts file yields 'access', but in the Accounts ORM object, its 'access_key'
    aws_access_key = account.get('access', account.get('access_key'))
    aws_secret_key = account.get('secret')

    missing_credentials = []
    if not aws_access_key:
        missing_credentials.append('access')
    if not aws_secret_key:
        missing_credentials.append('secret')

    if missing_credentials:
        raise MissingCredentials(
            'dict returned by account_accessor callable for {} must contain non-empty {} key(s)'
            .format(pd['netloc'], ', '.join(missing_credentials)),
            location=pd['netloc'], required_credentials=['access', 'secret'],
        )

    s3 = AmbryS3FS(
        bucket=pd['netloc'],
        #prefix=pd['path'],
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key
    )

    # NOTE(review): the ssl.match_hostname patch is deliberately left in
    # place — presumably because the returned S3FS connects lazily, after
    # this function returns. Confirm before re-enabling the restore below.
    # ssl.match_hostname = _old_match_hostname

    return s3