def test_xz(self):
    """
    Check that xz-compressed CSV data parses to the same frame as the
    uncompressed ``self.csv1`` file.

    Three read paths are exercised: an explicit ``compression='xz'`` with a
    path, the same with an open binary file object, and
    ``compression='infer'`` with a path ending in ``.xz``.
    """
    lzma = compat.import_lzma()

    with open(self.csv1, 'rb') as data_file:
        data = data_file.read()
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # The context manager closes (and therefore flushes) the archive
        # even if the write raises, instead of relying on a manual close().
        with lzma.LZMAFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='xz')
        tm.assert_frame_equal(result, expected)

        with open(path, 'rb') as f:
            result = self.read_csv(f, compression='xz')
            tm.assert_frame_equal(result, expected)

    with tm.ensure_clean('test.xz') as path:
        with lzma.LZMAFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, compression='infer')
        tm.assert_frame_equal(result, expected)
def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
    """
    Open ``path`` and return a file-like handle, decompressing on request.

    Parameters
    ----------
    path : str
        Location of the file to open.
    mode : str
        Mode handed to the underlying opener (not used for ZIP archives).
    encoding : str or None
        Text encoding. Combining it with ``compression`` raises on Python 2.
    compression : {'gzip', 'bz2', 'zip', 'xz', None}
        Compression protocol; ``None`` opens the file directly.
    memory_map : bool, default False
        For uncompressed files, try to wrap the handle in ``MMapWrapper``.

    Returns
    -------
    f : file-like

    Raises
    ------
    ValueError
        For encoding + compression on Python 2, an unknown compression
        name, or a ZIP archive that does not hold exactly one member.
    """
    if compression is None:
        if not compat.PY3:
            handle = open(path, mode)
        elif encoding:
            handle = open(path, mode, encoding=encoding)
        else:
            handle = open(path, mode, errors='replace')

        if memory_map and hasattr(handle, 'fileno'):
            try:
                mapped = MMapWrapper(handle)
                handle.close()
                handle = mapped
            except Exception:
                # Any failure here is swallowed on purpose: this matches
                # the lower-level behaviour of the C engine (pd.read_csv),
                # so the plain file handle is returned untouched.
                pass
        return handle

    if not compat.PY3 and encoding is not None:
        raise ValueError('encoding + compression not yet supported in Python 2')

    if compression == 'gzip':
        import gzip
        handle = gzip.GzipFile(path, mode)
    elif compression == 'bz2':
        import bz2
        handle = bz2.BZ2File(path, mode)
    elif compression == 'zip':
        import zipfile
        archive = zipfile.ZipFile(path)
        members = archive.namelist()
        member_count = len(members)
        if member_count == 0:
            raise ValueError(
                'Zero files found in ZIP file {}'.format(path))
        if member_count != 1:
            raise ValueError(
                'Multiple files found in ZIP file.'
                ' Only one file per ZIP :{}'.format(members))
        handle = archive.open(members[0])
    elif compression == 'xz':
        lzma = compat.import_lzma()
        handle = lzma.LZMAFile(path, mode)
    else:
        raise ValueError('Unrecognized compression type: %s' %
                         compression)

    if compat.PY3:
        # Decompressors yield bytes; expose them as text on Python 3.
        from io import TextIOWrapper
        handle = TextIOWrapper(handle, encoding=encoding)
    return handle
def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
    """
    Gets file handle for given path and mode.

    Parameters
    ----------
    path : str
        Location of the file to open.
    mode : str
        Mode passed to the opener (the ZIP branch does not use it).
    encoding : str or None
        Text encoding; not supported together with compression on Python 2.
    compression : {'gzip', 'bz2', 'zip', 'xz', None}
        Compression protocol, or ``None`` for a plain file.
    memory_map : bool, default False
        For plain files, attempt to wrap the handle in ``MMapWrapper``.

    Returns
    -------
    f : file-like

    Raises
    ------
    ValueError
        If encoding and compression are combined on Python 2, the
        compression type is unrecognized, or a ZIP archive does not contain
        exactly one file.
    """
    if compression is not None:
        if encoding is not None and not compat.PY3:
            msg = 'encoding + compression not yet supported in Python 2'
            raise ValueError(msg)

        if compression == 'gzip':
            import gzip
            f = gzip.GzipFile(path, mode)
        elif compression == 'bz2':
            import bz2
            f = bz2.BZ2File(path, mode)
        elif compression == 'zip':
            import zipfile
            zip_file = zipfile.ZipFile(path)
            zip_names = zip_file.namelist()

            if len(zip_names) == 1:
                file_name = zip_names.pop()
                f = zip_file.open(file_name)
            elif len(zip_names) == 0:
                raise ValueError('Zero files found in ZIP file {}'
                                 .format(path))
            else:
                raise ValueError('Multiple files found in ZIP file.'
                                 ' Only one file per ZIP :{}'
                                 .format(zip_names))
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path, mode)
        else:
            raise ValueError('Unrecognized compression type: %s' %
                             compression)
        if compat.PY3:
            # compressed streams are binary; wrap them for text access
            from io import TextIOWrapper
            f = TextIOWrapper(f, encoding=encoding)
        return f
    else:
        if compat.PY3:
            if encoding:
                f = open(path, mode, encoding=encoding)
            else:
                f = open(path, mode, errors='replace')
        else:
            f = open(path, mode)

        if memory_map and hasattr(f, 'fileno'):
            try:
                g = MMapWrapper(f)
                # Release the original handle once the wrapper exists;
                # previously it was rebound without being closed, leaking
                # the open file.
                # NOTE(review): assumes MMapWrapper does not need ``f``
                # after construction (it maps via the file descriptor) --
                # confirm against its definition.
                f.close()
                f = g
            except Exception:
                # we catch any errors that may have occurred
                # because that is consistent with the lower-level
                # functionality of the C engine (pd.read_csv), so
                # leave the file handler as is then
                pass

        return f
def lzma_file():
    """
    Look up the ``LZMAFile`` class on the module returned by
    ``compat.import_lzma``.

    Returns
    -------
    klass : type or None
        ``None`` when the import raises ``ImportError`` or the imported
        module has no ``LZMAFile`` attribute.
    """
    try:
        module = compat.import_lzma()
    except ImportError:
        return None
    return getattr(module, "LZMAFile", None)
def _get_handle(path, mode, encoding=None, compression=None):
    """
    Return an open handle for ``path``, transparently decompressing it.

    Parameters
    ----------
    path : str
        File to open.
    mode : str
        Mode forwarded to the opener; the ZIP branch ignores it.
    encoding : str or None
        Text encoding. On Python 2 it cannot be combined with compression.
    compression : {'gzip', 'bz2', 'zip', 'xz', None}
        Compression protocol, ``None`` meaning an ordinary file.

    Returns
    -------
    f : file-like

    Raises
    ------
    ValueError
        Encoding with compression on Python 2, an unrecognized compression
        type, or a ZIP archive with zero or several members.
    """
    uncompressed = compression is None

    if uncompressed and compat.PY3:
        if encoding:
            return open(path, mode, encoding=encoding)
        return open(path, mode, errors='replace')
    if uncompressed:
        return open(path, mode)

    if not (encoding is None or compat.PY3):
        msg = 'encoding + compression not yet supported in Python 2'
        raise ValueError(msg)

    if compression == 'gzip':
        import gzip
        stream = gzip.GzipFile(path, mode)
    elif compression == 'bz2':
        import bz2
        stream = bz2.BZ2File(path, mode)
    elif compression == 'zip':
        import zipfile
        container = zipfile.ZipFile(path)
        entries = container.namelist()

        if not entries:
            raise ValueError('Zero files found in ZIP file {}'
                             .format(path))
        if len(entries) > 1:
            raise ValueError('Multiple files found in ZIP file.'
                             ' Only one file per ZIP :{}'
                             .format(entries))
        only_entry = entries.pop()
        stream = container.open(only_entry)
    elif compression == 'xz':
        lzma = compat.import_lzma()
        stream = lzma.LZMAFile(path, mode)
    else:
        raise ValueError('Unrecognized compression type: %s' %
                         compression)

    if not compat.PY3:
        return stream

    # Python 3: present the binary decompressor as a text stream.
    from io import TextIOWrapper
    return TextIOWrapper(stream, encoding=encoding)
def decompress_file(path, compression):
    """
    Read the file at ``path`` and return its content decoded as UTF-8.

    Parameters
    ----------
    path : str
        File to read.
    compression : {'gzip', 'bz2', 'xz', None}
        Compression protocol of the file; ``None`` reads it as-is.

    Returns
    -------
    result : str
        The (decompressed) content decoded with ``'utf8'``.

    Raises
    ------
    ValueError
        If ``compression`` is not one of the supported values.
    """
    if compression is None:
        f = open(path, 'rb')
    elif compression == 'gzip':
        import gzip
        f = gzip.GzipFile(path, 'rb')
    elif compression == 'bz2':
        import bz2
        f = bz2.BZ2File(path, 'rb')
    elif compression == 'xz':
        lzma = compat.import_lzma()
        f = lzma.open(path, 'rb')
    else:
        msg = 'Unrecognized compression type: {}'.format(compression)
        raise ValueError(msg)

    # Close the handle even when reading or decoding fails (corrupt
    # archive, invalid UTF-8); previously it leaked on any exception.
    try:
        return f.read().decode('utf8')
    finally:
        f.close()
def test_to_csv_compression_xz(self):
    """
    Round-trip a frame through ``to_csv(..., compression="xz")``.

    The written file is read back once through ``read_csv`` with
    ``compression="xz"`` and once through a handle opened directly with
    the lzma module, which confirms the file really is xz-compressed.
    """
    # GH11852
    # use the compression kw in to_csv
    df = DataFrame([[0.123456, 0.234567, 0.567567],
                    [12.32112, 123123.2, 321321.2]],
                   index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with ensure_clean() as filename:

        df.to_csv(filename, compression="xz")

        # test the round trip - to_csv -> read_csv
        rs = read_csv(filename, compression="xz", index_col=0)
        assert_frame_equal(df, rs)

        # explicitly make sure file is xzipped; the context manager closes
        # the handle even when the comparison below fails
        lzma = compat.import_lzma()
        with lzma.open(filename, 'rb') as f:
            assert_frame_equal(df, read_csv(f, index_col=0))
def test_to_csv_compression_xz(self):
    """
    Round-trip a frame through ``to_csv(..., compression="xz")``.

    ``tm._skip_if_no_lzma()`` is called first. The written file is then
    read back through ``read_csv`` with ``compression="xz"`` and through a
    handle opened directly with the lzma module, which confirms the file
    really is xz-compressed.
    """
    # GH11852
    # use the compression kw in to_csv
    tm._skip_if_no_lzma()
    df = DataFrame([[0.123456, 0.234567, 0.567567],
                    [12.32112, 123123.2, 321321.2]],
                   index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with ensure_clean() as filename:

        df.to_csv(filename, compression="xz")

        # test the round trip - to_csv -> read_csv
        rs = read_csv(filename, compression="xz", index_col=0)
        assert_frame_equal(df, rs)

        # explicitly make sure file is xzipped; the context manager closes
        # the handle even when the comparison below fails
        lzma = compat.import_lzma()
        with lzma.open(filename, 'rb') as f:
            assert_frame_equal(df, read_csv(f, index_col=0))
def _skip_if_no_lzma():
    """
    Report whether calling ``import_lzma`` raises ``ImportError``.

    Returns
    -------
    True or None
        ``True`` when the import fails, ``None`` when it succeeds.
    """
    try:
        import_lzma()
    except ImportError:
        lzma_missing = True
    else:
        # The success path deliberately yields None rather than False.
        lzma_missing = None
    return lzma_missing
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf :
        a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
        If 'infer' and `path_or_buf` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If compression is combined with an encoding on a buffer in
        Python 2, if the compression type is unrecognized, or if a ZIP
        archive opened for reading does not contain exactly one file.
    """
    # s3fs is optional; when it is importable its S3File objects are also
    # wrapped in a TextIOWrapper further below.
    try:
        from s3fs import S3File
        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO,)

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, compat.string_types)

    # Compression is only (re)inferred for paths; buffers keep the value
    # that was passed in.
    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            import bz2
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            zf = BytesZipFile(path_or_buf, mode)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == 'w':
                # In write mode the archive object itself is the handle.
                f = zf
            elif zf.mode == 'r':
                # In read mode the archive must hold exactly one member.
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError('Zero files found in ZIP file {}'
                                     .format(path_or_buf))
                else:
                    raise ValueError('Multiple files found in ZIP file.'
                                     ' Only one file per ZIP: {}'
                                     .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            mode = "wb" if mode == "w" else mode
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace', newline="")
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if (compat.PY3 and is_text and
            (compression or isinstance(f, need_text_wrapping))):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding, newline='')
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf :
        a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : str or None
        Supported compression protocols are gzip, bz2, zip, and xz
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If compression is combined with an encoding on a buffer in
        Python 2, if the compression type is unrecognized, or if a ZIP
        archive opened for reading does not contain exactly one file.
    """
    # s3fs is optional; when it is importable its S3File objects are also
    # wrapped in a TextIOWrapper further below.
    try:
        from s3fs import S3File
        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO,)

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, compat.string_types)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            import bz2
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            zf = BytesZipFile(path_or_buf, mode)
            if zf.mode == 'w':
                # The archive itself is the handle; it is tracked by the
                # common ``handles.append(f)`` below.
                f = zf
            elif zf.mode == 'r':
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                    # Track the container too: previously only the member
                    # stream was returned in ``handles``, so callers had no
                    # way to close the archive opened here.
                    handles.append(zf)
                elif len(zip_names) == 0:
                    raise ValueError('Zero files found in ZIP file {}'
                                     .format(path_or_buf))
                else:
                    raise ValueError('Multiple files found in ZIP file.'
                                     ' Only one file per ZIP: {}'
                                     .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding)
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace')
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if (compat.PY3 and is_text and
            (compression or isinstance(f, need_text_wrapping))):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding)
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
import pandas.util._test_decorators as td import pandas as pd from pandas import ( Index, Series, period_range, ) import pandas._testing as tm from pandas.tseries.offsets import ( Day, MonthEnd, ) lzma = import_lzma() @pytest.fixture(scope="module") def current_pickle_data(): # our current version pickle data from pandas.tests.io.generate_legacy_storage_files import create_pickle_data return create_pickle_data() # --------------------- # comparison functions # --------------------- def compare_element(result, expected, typ, version=None): if isinstance(expected, Index):
""" Tests compressed data parsing functionality for all of the parsers defined in parsers.py """ import pytest import pandas as pd import pandas.compat as compat import pandas.util.testing as tm import pandas.util._test_decorators as td import gzip import bz2 try: lzma = compat.import_lzma() except ImportError: lzma = None class CompressionTests(object): def test_zip(self): import zipfile with open(self.csv1, 'rb') as data_file: data = data_file.read() expected = self.read_csv(self.csv1) with tm.ensure_clean('test_file.zip') as path: tmp = zipfile.ZipFile(path, mode='w') tmp.writestr('test_file', data)
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf :
        a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : str or None
        Supported compression protocols are gzip, bz2, zip, and xz
    memory_map : boolean, default False
        See parsers._parser_params for more information.

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If compression is combined with an encoding on a buffer in
        Python 2, if the compression type is unrecognized, or if a ZIP
        archive does not contain exactly one file.
    """

    handles = list()
    f = path_or_buf
    is_path = isinstance(path_or_buf, compat.string_types)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            import bz2
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            import zipfile
            # NOTE(review): ``mode`` is not passed to ZipFile here, and
            # ``zip_file`` is not added to ``handles`` -- only the member
            # stream is.
            zip_file = zipfile.ZipFile(path_or_buf)
            zip_names = zip_file.namelist()
            if len(zip_names) == 1:
                f = zip_file.open(zip_names.pop())
            elif len(zip_names) == 0:
                raise ValueError('Zero files found in ZIP file {}'
                                 .format(path_or_buf))
            else:
                raise ValueError('Multiple files found in ZIP file.'
                                 ' Only one file per ZIP: {}'
                                 .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding)
        else:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace')
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    # NOTE(review): ``need_text_wrapping`` is not defined in this function;
    # presumably a module-level tuple of types -- confirm.
    if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding)
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
def _get_handle(source, mode, encoding=None, compression=None,
                memory_map=False):
    """
    Gets file handle for given path or buffer and mode.

    Parameters
    ----------
    source : str or file-like
        A path (string) or an already-open buffer.
    mode : str
        Mode used when ``source`` is a path (the ZIP branch does not use it).
    encoding : str or None
        Text encoding; with compression on a non-path source it is
        rejected on Python 2.
    compression : str or None
        One of gzip, bz2, zip or xz; the value is lower-cased before it
        is compared.
    memory_map : boolean, default False
        Attempt to wrap the resulting handle in ``MMapWrapper``. Only
        reached when no compression is given.

    Returns
    -------
    f : file-like

    Raises
    ------
    ValueError
        For encoding + compression on a buffer in Python 2, an
        unrecognized compression, or a ZIP archive that does not contain
        exactly one file.
    """

    f = source
    is_path = isinstance(source, compat.string_types)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    # NOTE(review): this early return happens before ``compression`` is
    # looked at, so a compressed BytesIO is wrapped without being
    # decompressed -- confirm this is intended.
    if compat.PY3 and isinstance(source, compat.BytesIO):
        from io import TextIOWrapper

        return TextIOWrapper(source, encoding=encoding)

    elif compression is not None:
        compression = compression.lower()
        if encoding is not None and not compat.PY3 and not is_path:
            msg = 'encoding + compression not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip

            f = gzip.GzipFile(source, mode) \
                if is_path else gzip.GzipFile(fileobj=source)

        # BZ Compression
        elif compression == 'bz2':
            import bz2

            if is_path:
                f = bz2.BZ2File(source, mode)

            else:
                f = bz2.BZ2File(source) if compat.PY3 else StringIO(
                    bz2.decompress(source.read()))
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually

        # ZIP Compression
        elif compression == 'zip':
            import zipfile
            zip_file = zipfile.ZipFile(source)
            zip_names = zip_file.namelist()

            # Exactly one member is supported per archive.
            if len(zip_names) == 1:
                f = zip_file.open(zip_names.pop())
            elif len(zip_names) == 0:
                raise ValueError('Zero files found in ZIP file {}'
                                 .format(source))
            else:
                raise ValueError('Multiple files found in ZIP file.'
                                 ' Only one file per ZIP :{}'
                                 .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(source, mode)

        else:
            raise ValueError('Unrecognized compression: %s' % compression)

        # Decompressed streams are binary; wrap them as text on Python 3.
        if compat.PY3:
            from io import TextIOWrapper

            f = TextIOWrapper(f, encoding=encoding)

        return f

    elif is_path:
        if compat.PY3:
            if encoding:
                f = open(source, mode, encoding=encoding)
            else:
                f = open(source, mode, errors='replace')
        else:
            f = open(source, mode)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f
of the parsers defined in parsers.py """ import bz2 import gzip import pytest import pandas.compat as compat import pandas.util._test_decorators as td import pandas as pd import pandas.util.testing as tm try: lzma = compat.import_lzma() except ImportError: lzma = None class CompressionTests(object): def test_zip(self): import zipfile with open(self.csv1, 'rb') as data_file: data = data_file.read() expected = self.read_csv(self.csv1) with tm.ensure_clean('test_file.zip') as path: with zipfile.ZipFile(path, mode='w') as tmp: