Esempio n. 1
0
    def _unpack_gzip(self, fname, key, url):
        """Decompress the gzipped file *fname* and import the result into
        the package cache under *url*.

        Data is streamed in 1 KiB chunks through a tqdm progress bar.
        A ``NamedTemporaryFile(delete=False)`` is used because on Windows
        copying an open file fails with a permission error; the file is
        therefore closed first and removed manually in ``finally``.
        """
        logger.debug("Unpacking data file %s", key)

        # Track the temp-file path separately: if gzip.open() raises,
        # f_out would never be bound and the cleanup would NameError.
        tmp_name = None
        try:
            with gzip.open(fname, "rb") as f_in:
                with NamedTemporaryFile("wb", delete=False) as f_out:
                    tmp_name = f_out.name
                    with tqdm(
                            desc="Unpack",
                            unit="B",
                            unit_scale=True,
                            unit_divisor=1024,
                    ) as t:
                        # Wrap the read side so each read() advances the bar.
                        fobj = CallbackIOWrapper(t.update, f_in, "read")
                        while True:
                            chunk = fobj.read(1024)
                            if not chunk:
                                break
                            f_out.write(chunk)
                        f_out.flush()
                        t.reset()
            import_file_to_cache(url, tmp_name, pkgname=PKGNAME)
        finally:
            # Best-effort cleanup: the temp file may not exist if
            # decompression failed early; only OS errors are expected here.
            if tmp_name is not None:
                try:
                    os.remove(tmp_name)
                except OSError:
                    pass
Esempio n. 2
0
    def open(self, full_path, mode='r', encoding='utf-8'):
        """Open *full_path*, dispatching ``oss://`` URLs to OSS-backed files.

        Non-OSS paths fall through to the parent implementation. For OSS
        paths:

        * ``'w'`` / ``'wb'``: any existing object is deleted first; returns
          a writable OSSFile (or BinaryOSSFile for binary mode).
        * ``'a'``: returns an OSSFile positioned at the current object
          length (0 when the object does not exist yet).
        * ``'r'`` / ``'rb'``: downloads the whole object into memory and
          returns it wrapped in StringIO/BytesIO; raises FileNotFoundError
          when the object is absent.

        ``encoding`` is applied when decoding text-mode reads
        (default ``'utf-8'``).
        """
        if not full_path.startswith('oss://'):
            return super().open(full_path, mode)

        bucket, path = self._split(full_path)
        with mute_stderr():
            path_exists = bucket.object_exists(path)

        if 'w' in mode:
            if path_exists:
                bucket.delete_object(path)
            if 'b' in mode:
                return BinaryOSSFile(bucket, path)
            return OSSFile(bucket, path)

        if mode == 'a':
            # Resume writing at the end of the existing object, if any.
            position = bucket.head_object(path).content_length if path_exists else 0
            return OSSFile(bucket, path, position=position)

        # Read modes ('r' / 'rb').
        if not path_exists:
            raise FileNotFoundError(full_path)
        obj = bucket.get_object(path)
        if obj.content_length > 200 * 1024 ** 2:  # 200 MB: show a progress bar
            with tqdm(total=obj.content_length, unit='B', unit_scale=True,
                      unit_divisor=1024, leave=False,
                      desc='reading ' + os.path.basename(full_path)) as t:
                obj = CallbackIOWrapper(t.update, obj, "read")
                data = obj.read()
        else:
            data = obj.read()
        if mode == 'rb':
            return NullContextWrapper(BytesIO(data))
        assert mode == 'r'
        # Fix: honor the ``encoding`` argument instead of always decoding
        # as utf-8 (backward compatible -- the default is utf-8).
        return NullContextWrapper(StringIO(data.decode(encoding)))