def _download(url, cache_fs, cache_path, account_accessor, logger, callback):
    """Download `url` and store its bytes in `cache_fs` at `cache_path`.

    Dispatches on the URL scheme: ``s3:`` via an S3 filesystem object,
    ``ftp:`` via a streamed urlopen loop, anything else via HTTP(S) with
    `requests`.

    Args:
        url: source URL (``s3:``, ``ftp:``, or ``http(s):``).
        cache_fs: destination PyFilesystem object.
        cache_path: path within `cache_fs` to write the download to.
        account_accessor: credentials accessor handed to ``get_s3()``.
        logger: unused here; kept for interface compatibility.
        callback: optional progress callback, called as
            ``callback(bytes_this_read, total_bytes_so_far)``.

    Raises:
        ResourceNotFoundError: when the S3 path does not exist.
        requests.HTTPError: when the HTTP download fails.
    """
    import functools  # was used below but never imported — NameError risk
    import requests
    from fs.errors import ResourceNotFoundError
    # six.moves keeps this working on both Python 2 and 3; the original
    # Py2-only `urllib.unquote_plus` breaks on Py3, and `urlopen` was
    # referenced without any visible import.
    from six.moves.urllib.parse import unquote_plus
    from six.moves.urllib.request import urlopen

    if url.startswith('s3:'):
        s3 = get_s3(url, account_accessor)
        pd = parse_url_to_dict(url)

        try:
            with cache_fs.open(cache_path, 'wb') as fout:
                with s3.open(unquote_plus(pd['path']), 'rb') as fin:
                    copy_file_or_flo(fin, fout, cb=callback)
        except ResourceNotFoundError:
            raise ResourceNotFoundError(
                "Failed to find path '{}' in S3 FS '{}' ".format(pd['path'], s3))

    elif url.startswith('ftp:'):
        from contextlib import closing

        with closing(urlopen(url)) as fin:
            with cache_fs.open(cache_path, 'wb') as fout:
                read_len = 16 * 1024
                total_len = 0
                while True:
                    buf = fin.read(read_len)
                    if not buf:
                        break
                    fout.write(buf)
                    total_len += len(buf)
                    if callback:
                        callback(len(buf), total_len)

    else:
        r = requests.get(url, stream=True)
        r.raise_for_status()

        # Requests will auto decode gzip responses, but not when streaming.
        # The following monkey patch is recommended by a core developer at
        # https://github.com/kennethreitz/requests/issues/2155
        if r.headers.get('content-encoding') == 'gzip':
            r.raw.read = functools.partial(r.raw.read, decode_content=True)

        with cache_fs.open(cache_path, 'wb') as f:
            copy_file_or_flo(r.raw, f, cb=callback)

    assert cache_fs.exists(cache_path)
def _get_row_gen(self):
    """Return an excel row iterator for this source's file and segment."""
    from fs.errors import NoSysPathError

    try:
        # Fast path: the storage object exposes a real filesystem path.
        return self.excel_iter(self._fstor.syspath, self.spec.segment)
    except NoSysPathError:
        # There is no sys path when the file is in a ZipFile, or other
        # non-traditional filesystem, so materialize a copy in the
        # sub-cache and hand its real path to the excel reader.
        cache = self._fstor.sub_cache()
        with self._fstor.open(mode='rb') as src, cache.open(self.spec.name, mode='wb') as dst:
            copy_file_or_flo(src, dst)

        real_path = cache.getsyspath(self.spec.name)
        return self.excel_iter(real_path, self.spec.segment)
def __iter__(self):
    """Iterate over all of the lines in the file.

    Yields CSV rows from the underlying storage object. Any parse failure
    is wrapped in a SourceError that records the 1-based line number at
    which it occurred.
    """
    from contextlib import closing
    import six

    self.start()

    if six.PY3:
        import csv

        f = self._fstor.open('rtU', encoding=(self.spec.encoding or 'utf8'))
        reader = csv.reader(f)

        with closing(f):
            i = 0
            try:
                for row in reader:
                    i += 1
                    yield row
            except Exception as e:
                from ambry_sources.sources.exceptions import SourceError
                # Fixed: the original executed a bare `raise` first, which
                # made this wrap unreachable; `e.message` also does not
                # exist on Py3, so use str(e).
                raise SourceError(str(type(e)) + ';' + str(e) + "; line={}".format(i))
    else:
        import unicodecsv as csv

        # What a mess. In the PyFS interface, The 'b' option conflicts with
        # the 'U' open, and readline is hardcoded to use '\n' anyway.
        # BTW, the need for both may result from the file being saved on a
        # mac. If all else fails, try loading it into a spreadsheet format
        # and save with normal line endings.

        # Need to copy the file, since it may be in a Zip file
        import tempfile
        from ambry_sources.util import copy_file_or_flo

        fout = tempfile.NamedTemporaryFile(delete=False)

        with self._fstor.open('rb') as fin:
            copy_file_or_flo(fin, fout)

        fout.close()

        with open(fout.name, 'rbU') as f:
            if self.spec.encoding:
                reader = csv.reader(f, encoding=self.spec.encoding)
            else:
                reader = csv.reader(f)

            i = 0
            try:
                for row in reader:
                    i += 1
                    yield row
            except Exception as e:
                from ambry_sources.sources.exceptions import SourceError
                # Fixed: dead bare `raise` removed so the SourceError wrap
                # actually fires; e.message is valid on Py2.
                raise SourceError(str(type(e)) + ';' + e.message + "; line={}".format(i))
            finally:
                # Always remove the temp copy, even on parse failure.
                import os
                os.remove(fout.name)

    self.finish()