Example #1
def _download(url, cache_fs, cache_path, account_accessor, logger, callback):

    import functools

    import requests
    from fs.errors import ResourceNotFoundError
    from six.moves.urllib.parse import unquote_plus
    from six.moves.urllib.request import urlopen

    if url.startswith('s3:'):
        s3 = get_s3(url, account_accessor)
        pd = parse_url_to_dict(url)

        try:
            with cache_fs.open(cache_path, 'wb') as fout:
                with s3.open(unquote_plus(pd['path']), 'rb') as fin:
                    copy_file_or_flo(fin, fout, cb=callback)
        except ResourceNotFoundError:
            raise ResourceNotFoundError("Failed to find path '{}' in S3 FS '{}' ".format(pd['path'], s3))

    elif url.startswith('ftp:'):
        from contextlib import closing

        with closing(urlopen(url)) as fin:

            with cache_fs.open(cache_path, 'wb') as fout:

                read_len = 16 * 1024
                total_len = 0
                while True:
                    buf = fin.read(read_len)
                    if not buf:
                        break
                    fout.write(buf)
                    total_len += len(buf)

                    if callback:
                        callback(len(buf), total_len)

    else:
        r = requests.get(url, stream=True)
        r.raise_for_status()

        # Requests will auto-decode gzip responses, but not when streaming. The following
        # monkey patch is recommended by a core developer at
        # https://github.com/kennethreitz/requests/issues/2155
        if r.headers.get('content-encoding') == 'gzip':
            r.raw.read = functools.partial(r.raw.read, decode_content=True)

        with cache_fs.open(cache_path, 'wb') as f:
            copy_file_or_flo(r.raw, f, cb=callback)

        assert cache_fs.exists(cache_path)
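
For orientation, here is a minimal, hypothetical call site for _download. Only the signature comes from the example above; the URL, the cache directory, the progress callback, and the use of PyFilesystem's OSFS are illustrative assumptions.

# Hypothetical usage sketch; the URL, cache layout, and callback are assumptions.
from fs.osfs import OSFS

def report_progress(chunk_len, total_len):
    # Receives the size of the latest chunk and the running total.
    print('downloaded {} bytes so far'.format(total_len))

cache_fs = OSFS('/tmp/download_cache', create=True)

# account_accessor is only consulted for s3: URLs, so None is fine for plain HTTP.
_download('http://example.com/data.csv', cache_fs, 'data.csv',
          account_accessor=None, logger=None, callback=report_progress)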
Example #2
    def _get_row_gen(self):
        from fs.errors import NoSysPathError

        try:
            return self.excel_iter(self._fstor.syspath, self.spec.segment)
        except NoSysPathError:
            # There is no sys path when the file is in a ZipFile, or other non-traditional filesystem.
            sub_file = self._fstor.sub_cache()

            with self._fstor.open(mode='rb') as f_in, sub_file.open(self.spec.name, mode='wb') as f_out:
                copy_file_or_flo(f_in, f_out)

            spath = sub_file.getsyspath(self.spec.name)

            return self.excel_iter(spath, self.spec.segment)
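
The except NoSysPathError fallback is the usual PyFilesystem pattern for files that live inside an archive: copy them onto disk so a library that needs a real OS path (here, the Excel reader) can open them. Below is a standalone sketch of that pattern, using assumed file names and the older fs 0.x API these examples already rely on.

# Hypothetical illustration of the no-syspath fallback; 'source.zip' and
# 'book.xlsx' are assumed names.
from fs.zipfs import ZipFS
from fs.osfs import OSFS
from fs.errors import NoSysPathError

archive = ZipFS('source.zip')
cache = OSFS('/tmp/xl_cache', create=True)

try:
    # A file inside a ZIP archive has no OS-level path, so this raises.
    path = archive.getsyspath('book.xlsx')
except NoSysPathError:
    with archive.open('book.xlsx', 'rb') as f_in, cache.open('book.xlsx', 'wb') as f_out:
        f_out.write(f_in.read())
    path = cache.getsyspath('book.xlsx')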
Example #3
    def __iter__(self):
        """Iterate over all of the lines in the file"""

        from contextlib import closing
        import six

        self.start()

        if six.PY3:
            import csv
            f = self._fstor.open('rtU', encoding=(self.spec.encoding or 'utf8'))
            reader = csv.reader(f)

            with closing(f):

                i = 0
                try:
                    for row in reader:
                        i += 1

                        yield row
                except Exception as e:
                    from ambry_sources.sources.exceptions import SourceError
                    raise SourceError(str(type(e)) + '; ' + str(e) + '; line={}'.format(i))

        else:
            import unicodecsv as csv

            # What a mess. In the PyFS interface, the 'b' option conflicts with the 'U' option, and
            # readline is hardcoded to use '\n' anyway.
            # BTW, the need for both may result from the file being saved on a Mac. If all else fails,
            # try loading it into a spreadsheet program and saving it with normal line endings.

            # Need to copy the file, since it may be in a Zip file

            import tempfile
            from ambry_sources.util import copy_file_or_flo

            fout = tempfile.NamedTemporaryFile(delete=False)

            with self._fstor.open('rb') as fin:
                copy_file_or_flo(fin, fout)

            fout.close()

            with open(fout.name, 'rbU') as f:

                if self.spec.encoding:
                    reader = csv.reader(f, encoding=self.spec.encoding)
                else:
                    reader = csv.reader(f)

                i = 0
                try:
                    for row in reader:
                        i += 1

                        yield row
                except Exception as e:
                    from ambry_sources.sources.exceptions import SourceError
                    raise SourceError(str(type(e)) + '; ' + str(e) + '; line={}'.format(i))

                finally:
                    import os
                    os.remove(fout.name)

        self.finish()
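
Because __iter__ is a generator, self.start() runs only when iteration actually begins and self.finish() only if the caller drains the iterator. A hypothetical call site (the CsvSource name and its constructor arguments are assumptions; the class itself is not shown here):

# Hypothetical caller; CsvSource and its constructor arguments are assumed names.
source = CsvSource(spec, fstor)

header = None
for i, row in enumerate(source):
    if i == 0:
        header = row                     # first row is typically the header
        continue
    record = dict(zip(header, row))      # pair values with the header names
    print(record)
# self.finish() has run only because the loop consumed the whole iterator (no break).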