def _unpack_gzip(self, fname, key, url):
    logger.debug("Unpacking data file %s", key)
    # We have to use a try/finally block: on Windows, copying a file that is
    # still open (here the temporary file) crashes with a permission-denied
    # error. Therefore we close the file after copying into it and then
    # delete it manually.
    try:
        with gzip.open(fname, "rb") as f_in:
            with NamedTemporaryFile("wb", delete=False) as f_out:
                with tqdm(
                    # total=f_in.size,
                    desc="Unpack",
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as t:
                    # Wrap the gzip stream so every read() advances the progress bar.
                    fobj = CallbackIOWrapper(t.update, f_in, "read")
                    while True:
                        chunk = fobj.read(1024)
                        if not chunk:
                            break
                        f_out.write(chunk)
                    f_out.flush()
                    t.reset()
        # The temporary file is closed at this point, so it can be copied
        # into the cache safely on Windows as well.
        import_file_to_cache(url, f_out.name, pkgname=PKGNAME)
    finally:
        try:
            os.remove(f_out.name)
        except OSError:
            pass
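
# A minimal standalone sketch of the same streaming-decompress pattern used in
# _unpack_gzip above, without the astropy cache machinery (import_file_to_cache,
# PKGNAME). The function name `unpack_gzip_to` and the 1024-byte chunk size are
# illustrative assumptions, not part of the original API.
import gzip
import shutil

from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper


def unpack_gzip_to(src_gz, dst_path, chunk_size=1024):
    """Decompress `src_gz` into `dst_path`, reporting progress in decompressed bytes."""
    with gzip.open(src_gz, "rb") as f_in, open(dst_path, "wb") as f_out:
        with tqdm(desc="Unpack", unit="B", unit_scale=True, unit_divisor=1024) as t:
            # Each read() on the wrapper forwards the number of bytes read to t.update().
            wrapped = CallbackIOWrapper(t.update, f_in, "read")
            shutil.copyfileobj(wrapped, f_out, length=chunk_size)
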
def open(self, full_path, mode='r', encoding='utf-8'):
    if not full_path.startswith('oss://'):
        return super().open(full_path, mode)
    bucket, path = self._split(full_path)
    with mute_stderr():
        path_exists = bucket.object_exists(path)
    if 'w' in mode:
        if path_exists:
            bucket.delete_object(path)
        if 'b' in mode:
            return BinaryOSSFile(bucket, path)
        return OSSFile(bucket, path)
    elif mode == 'a':
        position = bucket.head_object(path).content_length if path_exists else 0
        return OSSFile(bucket, path, position=position)
    else:
        if not path_exists:
            raise FileNotFoundError(full_path)
        obj = bucket.get_object(path)
        # # auto cache large files to avoid memory issues
        # if obj.content_length > 200 * 1024 ** 2:  # 200M
        #     path = cache_file(full_path)
        #     return super().open(path, mode)
        if obj.content_length > 200 * 1024 ** 2:  # 200M
            with tqdm(total=obj.content_length, unit='B', unit_scale=True,
                      unit_divisor=1024, leave=False,
                      desc='reading ' + os.path.basename(full_path)) as t:
                # Wrap the OSS object so each read() updates the progress bar.
                obj = CallbackIOWrapper(t.update, obj, "read")
                data = obj.read()
        else:
            data = obj.read()
        if mode == 'rb':
            return NullContextWrapper(BytesIO(data))
        else:
            assert mode == 'r'
            return NullContextWrapper(StringIO(data.decode()))
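
# `NullContextWrapper` is not defined in this snippet. Below is a minimal sketch
# of one plausible implementation, assuming its purpose is to let the in-memory
# BytesIO/StringIO result be used either directly or inside a `with ... as f:`
# block without being closed on exit. This is an illustrative guess, not the
# project's actual class.
class NullContextWrapper:
    def __init__(self, obj):
        self._obj = obj

    def __getattr__(self, name):
        # Forward reads, seeks, etc. to the wrapped in-memory stream.
        return getattr(self._obj, name)

    def __iter__(self):
        return iter(self._obj)

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        # Deliberately do not close the wrapped buffer on context exit.
        return False
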