def _get_data(country):
    """Load the data from disk; otherwise download and save it"""
    from zipfile import ZipFile
    from pandas.io.common import get_filepath_or_buffer, _infer_compression

    data_path = os.path.join(STORAGE_DIR, country.upper() + '.txt')
    if os.path.exists(data_path):
        data = pd.read_csv(data_path, dtype={'postal_code': str})
    else:
        import urllib
        for download_url in DOWNLOAD_URLs:
            url = download_url.format(country=country)
            compression = _infer_compression(url, "zip")
            try:
                reader, encoding, compression = get_filepath_or_buffer(url)[:3]
            except urllib.error.HTTPError:
                continue
            break
        with ZipFile(reader) as fh_zip:
            with fh_zip.open(country.upper() + '.txt') as fh:
                data = pd.read_csv(fh, sep='\t', header=0, names=DATA_FIELDS,
                                   dtype={'postal_code': str})
        if not os.path.exists(STORAGE_DIR):
            os.mkdir(STORAGE_DIR)
        data.to_csv(data_path, index=None)
    return data_path, data
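# Hedged usage sketch for _get_data (not from the original module): STORAGE_DIR,
# DOWNLOAD_URLs and DATA_FIELDS are defined elsewhere in that module, so the values
# below are illustrative assumptions only, chosen to show the call shape and the
# caching behaviour (download once, then reread the saved .txt on later calls).
import os
import pandas as pd

STORAGE_DIR = os.path.expanduser("~/.postal_data")                          # assumed cache directory
DOWNLOAD_URLs = ["http://download.geonames.org/export/zip/{country}.zip"]   # assumed URL template list
DATA_FIELDS = ["country_code", "postal_code", "place_name"]                 # assumed column names

# data_path, data = _get_data("us")   # network download on first use; cached file afterwards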
def open_file(filepath_or_buffer, mode="r", encoding=None, compression="infer"):
    if encoding is not None:
        encoding = re.sub("_", "-", encoding).lower()

    compression = _infer_compression(filepath_or_buffer, compression)
    filepath_or_buffer, _, compression = get_filepath_or_buffer(
        filepath_or_buffer, encoding, compression)
    is_path = isinstance(filepath_or_buffer, str)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                return gzip.open(filepath_or_buffer, mode)
            return gzip.GzipFile(fileobj=filepath_or_buffer)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                return bz2.BZ2File(filepath_or_buffer, mode)
            return bz2.BZ2File(filepath_or_buffer)

        # ZIP Compression
        elif compression == "zip":
            zip_file = zipfile.ZipFile(filepath_or_buffer)
            zip_names = zip_file.namelist()
            if len(zip_names) == 1:
                return zip_file.open(zip_names.pop())
            if len(zip_names) == 0:
                raise ValueError(f"Zero files found in ZIP file {filepath_or_buffer}")
            else:
                raise ValueError("Multiple files found in ZIP file."
                                 f" Only one file per ZIP: {filepath_or_buffer}")

        # XZ Compression
        elif compression == "xz":
            return lzma.LZMAFile(filepath_or_buffer, mode)

        # Unrecognized Compression
        raise ValueError(f"Unrecognized compression type: {compression}")

    elif is_path:
        return open(filepath_or_buffer, mode, encoding=encoding)
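# Hedged usage sketch (not part of the snippet above): open_file picks a reader
# based on the compression inferred from the file name, so the same call reads
# plain and gzip-compressed paths alike. This assumes a pandas version whose
# get_filepath_or_buffer returns at least three values, as the snippet expects;
# the temporary file below exists only for the demo.
import gzip
import os
import tempfile

demo_path = os.path.join(tempfile.mkdtemp(), "events.csv.gz")
with gzip.open(demo_path, "wt") as fh:          # create a small gzip file for the demo
    fh.write("a,b\n1,2\n")

with open_file(demo_path, mode="rb") as fh:     # 'gzip' inferred from the '.gz' suffix
    print(fh.readline())                        # b'a,b\n'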
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For
        Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a
        valid value. For Python >= 3.4, 4 is a valid value. A negative value
        for the protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        pkl.dump(obj, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
def to_pickle(obj, path, compression='infer'):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    """
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
    finally:
        for _f in fh:
            _f.close()
def save_as_pickle(df, filepath, compression='infer', protocol=2):
    """
    Pickle (serialize) object to given file path.

    Original implementation in pandas.pickle. Modified to ensure pickle
    files written can be read with Python 2.x and Python 3.x

    :param DataFrame df: Pandas data frame
    :param str filepath: File path
    :param str compression: One of 'infer', 'gzip', 'bz2', 'xz', None, default 'infer'
    :param int protocol: pickle protocol used, e.g. pkl.HIGHEST_PROTOCOL
    """
    from pandas.io.common import _get_handle, _infer_compression

    inferred_compression = _infer_compression(filepath, compression)
    f, fh = _get_handle(filepath, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(df, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
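# Hedged usage sketch (not from the original code base): save_as_pickle defaults to
# protocol=2 because that is the highest pickle protocol Python 2 can read, so a
# file written on Python 3 stays loadable on Python 2. The DataFrame and path below
# are made up for illustration; the calls are left commented because save_as_pickle
# depends on its module-level `pkl` import.
import pandas as pd

df_demo = pd.DataFrame({"postal_code": ["01001", "01002"],
                        "city": ["Agawam", "Amherst"]})
# save_as_pickle(df_demo, "codes.pkl.gz", protocol=2)   # gzip inferred from the '.gz' suffix
# pd.read_pickle("codes.pkl.gz")                        # readable from either Python line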
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g. moved modules.
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For
        Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a
        valid value. For Python >= 3.4, 4 is a valid value. A negative value
        for the protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
def test_infer_compression_from_path(self, extension, expected, path_type):
    path = path_type('foo/bar.csv' + extension)
    compression = icom._infer_compression(path, compression='infer')
    assert compression == expected
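# Illustrative sketch of what the test above checks: _infer_compression maps a
# path's extension to a compression codec and returns None otherwise. The helper
# is private to pandas and its name/location varies by version; the import below
# matches the `icom` alias the test uses. No files are opened.
from pandas.io import common as icom

for ext, expected in [(".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"),
                      (".xz", "xz"), ("", None)]:
    inferred = icom._infer_compression("foo/bar.csv" + ext, compression="infer")
    print(ext or "<none>", "->", inferred, inferred == expected)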
def read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer',
             names=None, index_col=None, usecols=None, squeeze=False,
             prefix=None, mangle_dupe_cols=True, dtype=None, engine=None,
             converters=None, true_values=None, false_values=None,
             skipinitialspace=False, skiprows=None, nrows=None,
             na_values=None, keep_default_na=True, na_filter=True,
             verbose=False, skip_blank_lines=True, parse_dates=False,
             infer_datetime_format=False, keep_date_col=False,
             date_parser=None, dayfirst=False, iterator=False,
             chunksize=None, compression='infer', thousands=None,
             decimal=b'.', lineterminator=None, quotechar='"', quoting=0,
             escapechar=None, comment=None, encoding=None, dialect=None,
             tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True,
             skipfooter=0, skip_footer=0, doublequote=True,
             delim_whitespace=False, as_recarray=None, compact_ints=None,
             use_unsigned=None, low_memory=True, buffer_lines=None,
             memory_map=False, float_precision=None):
    """Read csv file from local disk.

    Args:
        filepath: The filepath of the csv file. We only support local files for now.
        kwargs: Keyword arguments in pandas::from_csv
    """
    kwargs = {
        'sep': sep, 'delimiter': delimiter, 'header': header, 'names': names,
        'index_col': index_col, 'usecols': usecols, 'squeeze': squeeze,
        'prefix': prefix, 'mangle_dupe_cols': mangle_dupe_cols, 'dtype': dtype,
        'engine': engine, 'converters': converters, 'true_values': true_values,
        'false_values': false_values, 'skipinitialspace': skipinitialspace,
        'skiprows': skiprows, 'nrows': nrows, 'na_values': na_values,
        'keep_default_na': keep_default_na, 'na_filter': na_filter,
        'verbose': verbose, 'skip_blank_lines': skip_blank_lines,
        'parse_dates': parse_dates,
        'infer_datetime_format': infer_datetime_format,
        'keep_date_col': keep_date_col, 'date_parser': date_parser,
        'dayfirst': dayfirst, 'iterator': iterator, 'chunksize': chunksize,
        'compression': compression, 'thousands': thousands, 'decimal': decimal,
        'lineterminator': lineterminator, 'quotechar': quotechar,
        'quoting': quoting, 'escapechar': escapechar, 'comment': comment,
        'encoding': encoding, 'dialect': dialect,
        'tupleize_cols': tupleize_cols, 'error_bad_lines': error_bad_lines,
        'warn_bad_lines': warn_bad_lines, 'skipfooter': skipfooter,
        'skip_footer': skip_footer, 'doublequote': doublequote,
        'delim_whitespace': delim_whitespace, 'as_recarray': as_recarray,
        'compact_ints': compact_ints, 'use_unsigned': use_unsigned,
        'low_memory': low_memory, 'buffer_lines': buffer_lines,
        'memory_map': memory_map, 'float_precision': float_precision,
    }

    # Default to Pandas read_csv for non-serializable objects
    if not isinstance(filepath_or_buffer, str) or \
            _infer_compression(filepath_or_buffer, compression) is not None:
        warnings.warn("Defaulting to Pandas implementation",
                      PendingDeprecationWarning)
        pd_obj = pd.read_csv(filepath_or_buffer, **kwargs)
        if isinstance(pd_obj, pd.DataFrame):
            return from_pandas(pd_obj, get_npartitions())
        return pd_obj

    filepath = filepath_or_buffer

    # TODO: handle case where header is a list of lines
    first_line = _get_firstline(filepath)
    columns = _infer_column(first_line, kwargs=kwargs)

    if header is None or (header == "infer" and names is not None):
        first_line = b""
        ignore_first_line = False
    else:
        ignore_first_line = True

    offsets = _compute_offset(filepath, get_npartitions(),
                              ignore_first_line=ignore_first_line)

    # Serialize objects to speed up later use in remote tasks
    first_line_id = ray.put(first_line)
    kwargs_id = ray.put(kwargs)

    df_obj_ids = []
    index_obj_ids = []
    for start, end in offsets:
        if start != 0:
            df, index = _read_csv_with_offset._submit(
                args=(filepath, start, end, kwargs_id, first_line_id),
                num_return_vals=2)
        else:
            df, index = _read_csv_with_offset._submit(
                args=(filepath, start, end, kwargs_id),
                num_return_vals=2)
        df_obj_ids.append(df)
        index_obj_ids.append(index)

    index = get_index.remote(*index_obj_ids) if index_col is not None else None

    return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index)
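# Hedged sketch (the real helpers are not shown above): _compute_offset presumably
# splits the file into byte ranges that end on newline boundaries, so each remote
# task parses only complete rows. compute_offsets below is a simplified
# illustration of that idea, not the original implementation.
import os

def compute_offsets(filepath, num_partitions, ignore_first_line=True):
    """Return (start, end) byte ranges aligned to line breaks."""
    file_size = os.path.getsize(filepath)
    offsets = []
    with open(filepath, "rb") as fh:
        if ignore_first_line:
            fh.readline()                      # skip the header row
        start = fh.tell()
        chunk = max(1, (file_size - start) // num_partitions)
        while start < file_size:
            fh.seek(min(start + chunk, file_size))
            fh.readline()                      # advance to the next newline (or EOF)
            end = min(fh.tell(), file_size)
            offsets.append((start, end))
            start = end
    return offsets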
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the
    specified file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: http://docs.python.org/2.7/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False, chunksize=None, compression='infer'):
    """
    Convert a JSON string to pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gcs, and file. For file URLs, a host is expected. For instance, a local
        file could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values', 'table'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

        .. versionadded:: 0.23.0 'table' as an allowed value for the
           ``orient`` argument

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,
        * it ends with ``'_time'``,
        * it begins with ``'timestamp'``,
        * it is ``'modified'``, or
        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default
        behaviour is to try and detect the correct precision, but if this is
        not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing
        only seconds, milliseconds, microseconds or nanoseconds respectively.
    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    chunksize : integer, default None
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionadded:: 0.21.0

    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, zip or xz if path_or_buf is a string ending in
        '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
        otherwise. If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.

        .. versionadded:: 0.21.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                 "primaryKey": "index",
                 "pandas_version": "0.20.0"},
      "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
               {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
    """
    compression = _infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression,
    )

    json_reader = JsonReader(
        filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
        convert_axes=convert_axes, convert_dates=convert_dates,
        keep_default_dates=keep_default_dates, numpy=numpy,
        precise_float=precise_float, date_unit=date_unit, encoding=encoding,
        lines=lines, chunksize=chunksize, compression=compression,
    )

    if chunksize:
        return json_reader

    result = json_reader.read()
    if should_close:
        try:
            filepath_or_buffer.close()
        except:  # noqa: flake8
            pass
    return result
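# Hedged usage sketch (not part of the docstring above): with compression='infer',
# read_json decompresses based on the extension, so a gzip-compressed,
# line-delimited file needs no extra flags. The temporary file exists only for
# the demo.
import gzip
import os
import tempfile
import pandas as pd

demo = os.path.join(tempfile.mkdtemp(), "records.json.gz")
with gzip.open(demo, "wt") as fh:
    fh.write('{"col 1": "a", "col 2": "b"}\n{"col 1": "c", "col 2": "d"}\n')

df = pd.read_json(demo, lines=True)      # compression='infer' picks gzip from '.gz'
print(df)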
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the
    specified file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: http://docs.python.org/2.7/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2',
        'xz', or 'zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
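# Hedged illustration of the final fallback above (not from the original module):
# pickles written by Python 2 can contain byte strings that Python 3's unpickler
# rejects with a UnicodeDecodeError, and retrying with encoding='latin1' maps
# those bytes one-to-one onto text. Plain pickle shows the same idea that the
# pandas compat loader relies on.
import pickle

py2_style_payload = b"(dp0\nS'name'\np1\nS'caf\\xe9'\np2\ns."   # protocol-0 pickle with a non-ASCII byte
try:
    pickle.loads(py2_style_payload)
except UnicodeDecodeError:
    obj = pickle.loads(py2_style_payload, encoding="latin1")
    print(obj)    # {'name': 'café'}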
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             float_format=None, cols=None, header=True, index=True,
             index_label=None, mode='w', nanRep=None, encoding=None,
             compression='infer', quoting=None, line_terminator='\n',
             chunksize=None, tupleize_cols=False, quotechar='"',
             date_format=None, doublequote=True, escapechar=None,
             decimal='.'):

    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()

    self.path_or_buf, _, _, _ = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression, mode=mode
    )
    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format
    self.decimal = decimal

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    if encoding is None:
        encoding = 'utf-8'
    self.encoding = encoding
    self.compression = _infer_compression(self.path_or_buf, compression)

    if quoting is None:
        quoting = csvlib.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csvlib.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator or os.linesep

    self.date_format = date_format

    self.tupleize_cols = tupleize_cols
    self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and
                           not self.tupleize_cols)

    # validate mi options
    if self.has_mi_columns:
        if cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

    if cols is not None:
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = self.obj.columns
    if isinstance(cols, ABCIndexClass):
        cols = cols.to_native_types(na_rep=na_rep,
                                    float_format=float_format,
                                    date_format=date_format,
                                    quoting=self.quoting)
    else:
        cols = list(cols)

    # save it
    self.cols = cols

    # preallocate data 2d list
    self.blocks = self.obj._data.blocks
    ncols = sum(b.shape[0] for b in self.blocks)
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 // (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    self.data_index = obj.index
    if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
            date_format is not None):
        from pandas import Index
        self.data_index = Index([x.strftime(date_format) if notna(x) else ''
                                 for x in self.data_index])

    self.nlevels = getattr(self.data_index, 'nlevels', 1)
    if not index:
        self.nlevels = 0
def read_csv(filepath_or_buffer, sep=",", delimiter=None, header="infer",
             names=None, index_col=None, usecols=None, squeeze=False,
             prefix=None, mangle_dupe_cols=True, dtype=None, engine=None,
             converters=None, true_values=None, false_values=None,
             skipinitialspace=False, skiprows=None, nrows=None,
             na_values=None, keep_default_na=True, na_filter=True,
             verbose=False, skip_blank_lines=True, parse_dates=False,
             infer_datetime_format=False, keep_date_col=False,
             date_parser=None, dayfirst=False, iterator=False,
             chunksize=None, compression="infer", thousands=None,
             decimal=b".", lineterminator=None, quotechar='"', quoting=0,
             escapechar=None, comment=None, encoding=None, dialect=None,
             tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True,
             skipfooter=0, doublequote=True, delim_whitespace=False,
             low_memory=True, memory_map=False, float_precision=None):
    """Read csv file from local disk.

    Args:
        filepath: The filepath of the csv file. We only support local files for now.
        kwargs: Keyword arguments in pandas::from_csv
    """
    # The intention of the inspection code is to reduce the amount of
    # communication we have to do between processes and nodes. We take a quick
    # pass over the arguments and remove those that are default values so we
    # don't have to serialize and send them to the workers. Because the
    # arguments list is so long, this does end up saving time based on the
    # number of nodes in the cluster.
    frame = inspect.currentframe()
    _, _, _, kwargs = inspect.getargvalues(frame)
    try:
        args, _, _, defaults, _, _, _ = inspect.getfullargspec(read_csv)
        defaults = dict(zip(args[1:], defaults))
        kwargs = {
            kw: kwargs[kw]
            for kw in kwargs
            if kw in defaults and kwargs[kw] != defaults[kw]
        }
    # This happens on Python2, we will just default to serializing the entire
    # dictionary
    except AttributeError:
        # We suppress the error and delete the kwargs not needed in the remote
        # function.
        del kwargs["filepath_or_buffer"]
        del kwargs["frame"]

    if isinstance(filepath_or_buffer, str):
        if not os.path.exists(filepath_or_buffer):
            warnings.warn(
                "File not found on disk. Defaulting to Pandas implementation.",
                UserWarning,
            )
            return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    elif not isinstance(filepath_or_buffer, py.path.local):
        read_from_pandas = True
        # Pandas read_csv supports pathlib.Path
        try:
            import pathlib

            if isinstance(filepath_or_buffer, pathlib.Path):
                read_from_pandas = False
        except ImportError:
            pass
        if read_from_pandas:
            warnings.warn(
                "Reading from buffer. Defaulting to Pandas implementation.",
                UserWarning)
            return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    if _infer_compression(filepath_or_buffer, compression) is not None:
        warnings.warn(
            "Compression detected. Defaulting to Pandas implementation.",
            UserWarning)
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)

    if chunksize is not None:
        warnings.warn(
            "Reading chunks from a file. Defaulting to Pandas implementation.",
            UserWarning,
        )
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)

    if skiprows is not None and not isinstance(skiprows, int):
        warnings.warn(("Defaulting to Pandas implementation. To speed up "
                       "read_csv through the Modin implementation, "
                       "comment the rows to skip instead."))
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)

    # TODO: replace this by reading lines from file.
    if nrows is not None:
        warnings.warn("Defaulting to Pandas implementation.", UserWarning)
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    else:
        return _read_csv_from_file_pandas_on_ray(filepath_or_buffer, kwargs)
def _read(cls, filepath_or_buffer, **kwargs):
    """Read csv file from local disk.

    Args:
        filepath_or_buffer: The filepath of the csv file.
            We only support local files for now.
        kwargs: Keyword arguments in pandas.read_csv
    """
    # The intention of the inspection code is to reduce the amount of
    # communication we have to do between processes and nodes. We take a quick
    # pass over the arguments and remove those that are default values so we
    # don't have to serialize and send them to the workers. Because the
    # arguments list is so long, this does end up saving time based on the
    # number of nodes in the cluster.
    try:
        args, _, _, defaults, _, _, _ = inspect.getfullargspec(cls.read_csv)
        defaults = dict(zip(args[2:], defaults))
        filtered_kwargs = {
            kw: kwargs[kw]
            for kw in kwargs
            if kw in defaults
            and not isinstance(kwargs[kw], type(defaults[kw]))
            or kwargs[kw] != defaults[kw]
        }
    # This happens on Python2, we will just default to serializing the entire
    # dictionary
    except AttributeError:
        filtered_kwargs = kwargs

    if isinstance(filepath_or_buffer, str):
        if not os.path.exists(filepath_or_buffer):
            ErrorMessage.default_to_pandas("File not found on disk")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)
    elif not isinstance(filepath_or_buffer, py.path.local):
        read_from_pandas = True
        # Pandas read_csv supports pathlib.Path
        try:
            import pathlib

            if isinstance(filepath_or_buffer, pathlib.Path):
                read_from_pandas = False
        except ImportError:
            pass
        if read_from_pandas:
            ErrorMessage.default_to_pandas("Reading from buffer.")
            return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)
    if (_infer_compression(filepath_or_buffer, kwargs.get("compression"))
            is not None):
        ErrorMessage.default_to_pandas("Compression detected.")
        return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)

    chunksize = kwargs.get("chunksize")
    if chunksize is not None:
        ErrorMessage.default_to_pandas("Reading chunks from a file.")
        return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)

    skiprows = kwargs.get("skiprows")
    if skiprows is not None and not isinstance(skiprows, int):
        ErrorMessage.default_to_pandas("skiprows parameter not optimized yet.")
        return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)

    # TODO: replace this by reading lines from file.
    if kwargs.get("nrows") is not None:
        ErrorMessage.default_to_pandas("`read_csv` with `nrows`")
        return cls._read_csv_from_pandas(filepath_or_buffer, filtered_kwargs)
    else:
        return cls._read_csv_from_file_pandas_on_ray(
            filepath_or_buffer, filtered_kwargs)
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             float_format=None, cols=None, header=True, index=True,
             index_label=None, mode='w', encoding=None, compression='infer',
             quoting=None, line_terminator='\n', chunksize=None,
             quotechar='"', date_format=None, doublequote=True,
             escapechar=None, decimal='.'):

    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()

    self.path_or_buf, _, _, _ = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression, mode=mode)
    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format
    self.decimal = decimal

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    if encoding is None:
        encoding = 'utf-8'
    self.encoding = encoding
    self.compression = _infer_compression(self.path_or_buf, compression)

    if quoting is None:
        quoting = csvlib.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csvlib.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator or os.linesep

    self.date_format = date_format

    self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)

    # validate mi options
    if self.has_mi_columns:
        if cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

    if cols is not None:
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = self.obj.columns
    if isinstance(cols, ABCIndexClass):
        cols = cols.to_native_types(na_rep=na_rep,
                                    float_format=float_format,
                                    date_format=date_format,
                                    quoting=self.quoting)
    else:
        cols = list(cols)

    # save it
    self.cols = cols

    # preallocate data 2d list
    self.blocks = self.obj._data.blocks
    ncols = sum(b.shape[0] for b in self.blocks)
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 // (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    self.data_index = obj.index
    if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
            date_format is not None):
        from pandas import Index
        self.data_index = Index([
            x.strftime(date_format) if notna(x) else ''
            for x in self.data_index
        ])

    self.nlevels = getattr(self.data_index, 'nlevels', 1)
    if not index:
        self.nlevels = 0