Example #1
    def _get_data(country):
        """Load the data from disk; otherwise download and save it"""
        from zipfile import ZipFile
        from pandas.io.common import get_filepath_or_buffer, _infer_compression
        data_path = os.path.join(STORAGE_DIR, country.upper() + '.txt')
        if os.path.exists(data_path):
            data = pd.read_csv(data_path, dtype={'postal_code': str})
        else:
            import urllib.error  # needed for urllib.error.HTTPError below
            for download_url in DOWNLOAD_URLs:
                url = download_url.format(country=country)
                compression = _infer_compression(url, "zip")
                try:
                    reader, encoding, compression = get_filepath_or_buffer(
                        url)[:3]
                except urllib.error.HTTPError:
                    continue
                break
            with ZipFile(reader) as fh_zip:
                with fh_zip.open(country.upper() + '.txt') as fh:
                    data = pd.read_csv(fh,
                                       sep='\t',
                                       header=0,
                                       names=DATA_FIELDS,
                                       dtype={'postal_code': str})
            if not os.path.exists(STORAGE_DIR):
                os.mkdir(STORAGE_DIR)
            data.to_csv(data_path, index=False)

        return data_path, data
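
The helper above relies on module-level constants (STORAGE_DIR, DOWNLOAD_URLs, DATA_FIELDS) that are not shown. A self-contained sketch of the same cache-or-download pattern using only public APIs could look like the following; the URL, cache directory, and country code are placeholders, not the original values:

import io
import os
import urllib.request
from zipfile import ZipFile

import pandas as pd

CACHE_DIR = "postal_cache"                       # hypothetical cache directory
URL = "https://example.com/{country}.zip"        # hypothetical download URL

def get_data(country="US"):
    cache_path = os.path.join(CACHE_DIR, country.upper() + ".txt")
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path, dtype={"postal_code": str})
    # Download the archive into memory and read the single member we need.
    with urllib.request.urlopen(URL.format(country=country)) as resp:
        payload = io.BytesIO(resp.read())
    with ZipFile(payload) as zf, zf.open(country.upper() + ".txt") as fh:
        data = pd.read_csv(fh, sep="\t", dtype={"postal_code": str})
    os.makedirs(CACHE_DIR, exist_ok=True)
    data.to_csv(cache_path, index=False)
    return data
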
Example #2
def open_file(filepath_or_buffer, mode="r", encoding=None, compression="infer"):
    if encoding is not None:
        encoding = re.sub("_", "-", encoding).lower()

    compression = _infer_compression(filepath_or_buffer, compression)
    filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer, encoding, compression)

    is_path = isinstance(filepath_or_buffer, str)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                return gzip.open(filepath_or_buffer, mode)
            return gzip.GzipFile(fileobj=filepath_or_buffer)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                return bz2.BZ2File(filepath_or_buffer, mode)
            return bz2.BZ2File(filepath_or_buffer)

        # ZIP Compression
        elif compression == "zip":
            zip_file = zipfile.ZipFile(filepath_or_buffer)
            zip_names = zip_file.namelist()
            if len(zip_names) == 1:
                return zip_file.open(zip_names.pop())
            if len(zip_names) == 0:
                raise ValueError(f"Zero files found in ZIP file {filepath_or_buffer}")
            else:
                raise ValueError("Multiple files found in ZIP file."
                                 f" Only one file per ZIP: {filepath_or_buffer}")

        # XZ Compression
        elif compression == "xz":
            return lzma.LZMAFile(filepath_or_buffer, mode)

        # Unrecognized Compression
        raise ValueError(f"Unrecognized compression type: {compression}")

    elif is_path:
        return open(filepath_or_buffer, mode, encoding=encoding)

    # No compression and not a path: assume it is already a file-like object.
    return filepath_or_buffer
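
A quick usage sketch for open_file (the file name is made up); it assumes the snippet's module-level imports (re, gzip, bz2, zipfile, lzma) and an older pandas whose pandas.io.common still provides _infer_compression and a three-value get_filepath_or_buffer, as the unpacking above expects:

import gzip

with gzip.open("example.csv.gz", "wt") as fh:    # create a tiny compressed file
    fh.write("a,b\n1,2\n")

handle = open_file("example.csv.gz")             # ".gz" suffix -> gzip handling
print(handle.read())                             # b'a,b\n1,2\n'
handle.close()
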
Example #3
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0


    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path,
                        'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        pkl.dump(obj, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
Example #4
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0


    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        pkl.dump(obj, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
Example #5
def to_pickle(obj, path, compression='infer'):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    """
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
    finally:
        for _f in fh:
            _f.close()
Example #6
def to_pickle(obj, path, compression='infer'):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    """
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
    finally:
        for _f in fh:
            _f.close()
Example #7
def save_as_pickle(df, filepath, compression='infer', protocol=2):
    """
    Pickle (serialize) object to given file path.
    Original implementation in pandas.io.pickle. Modified to ensure pickle files
    written can be read with both Python 2.x and Python 3.x.

    :param DataFrame df: Pandas data frame
    :param str filepath: File path
    :param str compression: One of 'infer', 'gzip', 'bz2', 'xz', None,
                            default 'infer'
    :param int protocol: pickle protocol used, e.g. pkl.HIGHEST_PROTOCOL
    """
    from pandas.io.common import _get_handle, _infer_compression
    inferred_compression = _infer_compression(filepath, compression)
    f, fh = _get_handle(filepath,
                        'wb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        pkl.dump(df, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
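
A round-trip sketch for save_as_pickle (the file name is arbitrary); it assumes pkl is the pickle module and a pandas version old enough that pandas.io.common still exports _get_handle and _infer_compression:

import pandas as pd

df = pd.DataFrame({"foo": range(3), "bar": range(3, 6)})
save_as_pickle(df, "dummy.pkl.gz")               # ".gz" suffix -> gzip-compressed pickle
restored = pd.read_pickle("dummy.pkl.gz")        # the public API reads it back
print(df.equals(restored))                       # True
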
Example #8
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g., moved modules.
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))
    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #9
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
Example #10
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path,
                            'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g., moved modules.
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #11
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path,
                        'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
Example #12
 def test_infer_compression_from_path(self, extension, expected, path_type):
     path = path_type('foo/bar.csv' + extension)
     compression = icom._infer_compression(path, compression='infer')
     assert compression == expected
Example #13
 def test_infer_compression_from_path(self, extension, expected, path_type):
     path = path_type('foo/bar.csv' + extension)
     compression = icom._infer_compression(path, compression='infer')
     assert compression == expected
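
Both test snippets above appear without the pytest parametrization that supplies extension, expected, and path_type. A self-contained sketch of how such a test could be parametrized; the extension/expected pairs and path types below are assumptions, not the original fixtures, and a pandas version that still exposes pandas.io.common._infer_compression is assumed:

import pathlib

import pytest
import pandas.io.common as icom

@pytest.mark.parametrize("extension,expected", [
    ("", None),
    (".gz", "gzip"),
    (".bz2", "bz2"),
    (".zip", "zip"),
    (".xz", "xz"),
])
@pytest.mark.parametrize("path_type", [str, pathlib.Path])
def test_infer_compression_from_path(extension, expected, path_type):
    path = path_type("foo/bar.csv" + extension)
    compression = icom._infer_compression(path, compression="infer")
    assert compression == expected
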
Example #14
def read_csv(filepath_or_buffer,
             sep=',',
             delimiter=None,
             header='infer',
             names=None,
             index_col=None,
             usecols=None,
             squeeze=False,
             prefix=None,
             mangle_dupe_cols=True,
             dtype=None,
             engine=None,
             converters=None,
             true_values=None,
             false_values=None,
             skipinitialspace=False,
             skiprows=None,
             nrows=None,
             na_values=None,
             keep_default_na=True,
             na_filter=True,
             verbose=False,
             skip_blank_lines=True,
             parse_dates=False,
             infer_datetime_format=False,
             keep_date_col=False,
             date_parser=None,
             dayfirst=False,
             iterator=False,
             chunksize=None,
             compression='infer',
             thousands=None,
             decimal=b'.',
             lineterminator=None,
             quotechar='"',
             quoting=0,
             escapechar=None,
             comment=None,
             encoding=None,
             dialect=None,
             tupleize_cols=None,
             error_bad_lines=True,
             warn_bad_lines=True,
             skipfooter=0,
             skip_footer=0,
             doublequote=True,
             delim_whitespace=False,
             as_recarray=None,
             compact_ints=None,
             use_unsigned=None,
             low_memory=True,
             buffer_lines=None,
             memory_map=False,
             float_precision=None):
    """Read csv file from local disk.

    Args:
        filepath:
              The filepath of the csv file.
              We only support local files for now.
        kwargs: Keyword arguments in pandas::from_csv
    """

    kwargs = {
        'sep': sep,
        'delimiter': delimiter,
        'header': header,
        'names': names,
        'index_col': index_col,
        'usecols': usecols,
        'squeeze': squeeze,
        'prefix': prefix,
        'mangle_dupe_cols': mangle_dupe_cols,
        'dtype': dtype,
        'engine': engine,
        'converters': converters,
        'true_values': true_values,
        'false_values': false_values,
        'skipinitialspace': skipinitialspace,
        'skiprows': skiprows,
        'nrows': nrows,
        'na_values': na_values,
        'keep_default_na': keep_default_na,
        'na_filter': na_filter,
        'verbose': verbose,
        'skip_blank_lines': skip_blank_lines,
        'parse_dates': parse_dates,
        'infer_datetime_format': infer_datetime_format,
        'keep_date_col': keep_date_col,
        'date_parser': date_parser,
        'dayfirst': dayfirst,
        'iterator': iterator,
        'chunksize': chunksize,
        'compression': compression,
        'thousands': thousands,
        'decimal': decimal,
        'lineterminator': lineterminator,
        'quotechar': quotechar,
        'quoting': quoting,
        'escapechar': escapechar,
        'comment': comment,
        'encoding': encoding,
        'dialect': dialect,
        'tupleize_cols': tupleize_cols,
        'error_bad_lines': error_bad_lines,
        'warn_bad_lines': warn_bad_lines,
        'skipfooter': skipfooter,
        'skip_footer': skip_footer,
        'doublequote': doublequote,
        'delim_whitespace': delim_whitespace,
        'as_recarray': as_recarray,
        'compact_ints': compact_ints,
        'use_unsigned': use_unsigned,
        'low_memory': low_memory,
        'buffer_lines': buffer_lines,
        'memory_map': memory_map,
        'float_precision': float_precision,
    }

    # Default to Pandas read_csv for non-serializable objects
    if not isinstance(filepath_or_buffer, str) or \
            _infer_compression(filepath_or_buffer, compression) is not None:

        warnings.warn("Defaulting to Pandas implementation",
                      PendingDeprecationWarning)

        pd_obj = pd.read_csv(filepath_or_buffer, **kwargs)
        if isinstance(pd_obj, pd.DataFrame):
            return from_pandas(pd_obj, get_npartitions())

        return pd_obj

    filepath = filepath_or_buffer

    # TODO: handle case where header is a list of lines
    first_line = _get_firstline(filepath)
    columns = _infer_column(first_line, kwargs=kwargs)
    if header is None or (header == "infer" and names is not None):
        first_line = b""
        ignore_first_line = False
    else:
        ignore_first_line = True

    offsets = _compute_offset(filepath, get_npartitions(),
                              ignore_first_line=ignore_first_line)

    # Serialize objects to speed up later use in remote tasks
    first_line_id = ray.put(first_line)
    kwargs_id = ray.put(kwargs)

    df_obj_ids = []
    index_obj_ids = []
    for start, end in offsets:
        if start != 0:
            df, index = _read_csv_with_offset._submit(
                args=(filepath, start, end, kwargs_id, first_line_id),
                num_return_vals=2)
        else:
            df, index = _read_csv_with_offset._submit(
                args=(filepath, start, end, kwargs_id),
                num_return_vals=2)
        df_obj_ids.append(df)
        index_obj_ids.append(index)

    index = get_index.remote(*index_obj_ids) if index_col is not None else None

    return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index)
Example #15
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the specified
    file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: http://docs.python.org/2.7/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path,
                            'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #16
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False, chunksize=None, compression='infer'):
    """
    Convert a JSON string to pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gcs, and file. For file URLs, a host is expected. For instance, a local
        file could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values', 'table'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

        .. versionadded:: 0.23.0
           'table' as an allowed value for the ``orient`` argument

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    chunksize : integer, default None
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionadded:: 0.21.0

    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, zip or xz if path_or_buf is a string ending in
        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
        otherwise. If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.

        .. versionadded:: 0.21.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                    "primaryKey": "index",
                    "pandas_version": "0.20.0"},
        "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
                {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
    """

    compression = _infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression,
    )

    json_reader = JsonReader(
        filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
        convert_axes=convert_axes, convert_dates=convert_dates,
        keep_default_dates=keep_default_dates, numpy=numpy,
        precise_float=precise_float, date_unit=date_unit, encoding=encoding,
        lines=lines, chunksize=chunksize, compression=compression,
    )

    if chunksize:
        return json_reader

    result = json_reader.read()
    if should_close:
        try:
            filepath_or_buffer.close()
        except:  # noqa: flake8
            pass
    return result
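
The docstring describes the lines/chunksize combination but does not show it; a small sketch with a made-up file name:

import pandas as pd

pd.DataFrame({"a": range(6)}).to_json("records.jsonl", orient="records", lines=True)

reader = pd.read_json("records.jsonl", lines=True, chunksize=2)
for chunk in reader:                             # each chunk is a DataFrame
    print(chunk.shape)                           # prints (2, 1) three times
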
Example #17
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False, chunksize=None, compression='infer'):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gcs, and file. For file URLs, a host is expected. For instance, a local
        file could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values', 'table'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

        .. versionadded:: 0.23.0
           'table' as an allowed value for the ``orient`` argument

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    chunksize: integer, default None
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionadded:: 0.21.0

    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, zip or xz if path_or_buf is a string ending in
        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
        otherwise. If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.

        .. versionadded:: 0.21.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                    "primaryKey": "index",
                    "pandas_version": "0.20.0"},
        "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
                {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
    """

    compression = _infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
        path_or_buf, encoding=encoding, compression=compression,
    )

    json_reader = JsonReader(
        filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
        convert_axes=convert_axes, convert_dates=convert_dates,
        keep_default_dates=keep_default_dates, numpy=numpy,
        precise_float=precise_float, date_unit=date_unit, encoding=encoding,
        lines=lines, chunksize=chunksize, compression=compression,
    )

    if chunksize:
        return json_reader

    result = json_reader.read()
    if should_close:
        try:
            filepath_or_buffer.close()
        except:  # noqa: flake8
            pass
    return result
Example #18
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the specified
    file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: http://docs.python.org/2.7/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """

    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))
    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #19
    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression='infer', quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf, _, _, _ = get_filepath_or_buffer(
            path_or_buf, encoding=encoding, compression=compression, mode=mode
        )
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        if encoding is None:
            encoding = 'utf-8'
        self.encoding = encoding
        self.compression = _infer_compression(self.path_or_buf, compression)

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator or os.linesep

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        if cols is not None:
            if isinstance(cols, ABCIndexClass):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
                date_format is not None):
            from pandas import Index
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0
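
The compression stored on the writer above is inferred from the target path. A minimal public-API sketch of that inference (the file name is arbitrary), assuming a pandas version where to_csv defaults to compression='infer' as in this signature:

import gzip

import pandas as pd

pd.DataFrame({"a": [1, 2]}).to_csv("out.csv.gz", index=False)    # ".gz" -> gzip output
with gzip.open("out.csv.gz", "rt") as fh:
    print(fh.read())                             # prints the plain CSV text "a\n1\n2\n"
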
Example #20
def read_csv(
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    tupleize_cols=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
):
    """Read csv file from local disk.
    Args:
        filepath:
              The filepath of the csv file.
              We only support local files for now.
        kwargs: Keyword arguments in pandas::from_csv
    """
    # The intention of the inspection code is to reduce the amount of
    # communication we have to do between processes and nodes. We take a quick
    # pass over the arguments and remove those that are default values so we
    # don't have to serialize and send them to the workers. Because the
    # arguments list is so long, this does end up saving time based on the
    # number of nodes in the cluster.
    frame = inspect.currentframe()
    _, _, _, kwargs = inspect.getargvalues(frame)
    try:
        args, _, _, defaults, _, _, _ = inspect.getfullargspec(read_csv)
        defaults = dict(zip(args[1:], defaults))
        kwargs = {
            kw: kwargs[kw]
            for kw in kwargs if kw in defaults and kwargs[kw] != defaults[kw]
        }
    # This happens on Python 2; we just default to serializing the entire dictionary.
    except AttributeError:
        # We suppress the error and delete the kwargs not needed in the remote function.
        del kwargs["filepath_or_buffer"]
        del kwargs["frame"]

    if isinstance(filepath_or_buffer, str):
        if not os.path.exists(filepath_or_buffer):
            warnings.warn(
                "File not found on disk. Defaulting to Pandas implementation.",
                UserWarning,
            )
            return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    elif not isinstance(filepath_or_buffer, py.path.local):
        read_from_pandas = True
        # Pandas read_csv supports pathlib.Path
        try:
            import pathlib

            if isinstance(filepath_or_buffer, pathlib.Path):
                read_from_pandas = False
        except ImportError:
            pass
        if read_from_pandas:
            warnings.warn(
                "Reading from buffer. Defaulting to Pandas implementation.",
                UserWarning)
            return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    if _infer_compression(filepath_or_buffer, compression) is not None:
        warnings.warn(
            "Compression detected. Defaulting to Pandas implementation.",
            UserWarning)
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    if chunksize is not None:
        warnings.warn(
            "Reading chunks from a file. Defaulting to Pandas implementation.",
            UserWarning,
        )
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    if skiprows is not None and not isinstance(skiprows, int):
        warnings.warn(("Defaulting to Pandas implementation. To speed up "
                       "read_csv through the Modin implementation, "
                       "comment the rows to skip instead."))
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    # TODO: replace this by reading lines from file.
    if nrows is not None:
        warnings.warn("Defaulting to Pandas implementation.", UserWarning)
        return _read_csv_from_pandas(filepath_or_buffer, kwargs)
    else:
        return _read_csv_from_file_pandas_on_ray(filepath_or_buffer, kwargs)
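
The argument-filtering step at the top of this function can be isolated into a standalone sketch (the toy function f below is hypothetical): only keywords that differ from their declared defaults survive, which keeps the kwargs payload sent to remote workers small.

import inspect

def f(path, sep=",", header="infer", nrows=None):
    frame = inspect.currentframe()
    _, _, _, values = inspect.getargvalues(frame)
    spec = inspect.getfullargspec(f)
    defaults = dict(zip(spec.args[1:], spec.defaults))
    # Keep only the keyword arguments whose values differ from the defaults.
    return {kw: values[kw] for kw in defaults if values[kw] != defaults[kw]}

print(f("data.csv", nrows=10))                   # {'nrows': 10}
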
Example #21
    def _read(cls, filepath_or_buffer, **kwargs):
        """Read csv file from local disk.
        Args:
            filepath_or_buffer:
                  The filepath of the csv file.
                  We only support local files for now.
            kwargs: Keyword arguments in pandas.read_csv
        """
        # The intention of the inspection code is to reduce the amount of
        # communication we have to do between processes and nodes. We take a quick
        # pass over the arguments and remove those that are default values so we
        # don't have to serialize and send them to the workers. Because the
        # arguments list is so long, this does end up saving time based on the
        # number of nodes in the cluster.
        try:
            args, _, _, defaults, _, _, _ = inspect.getfullargspec(
                cls.read_csv)
            defaults = dict(zip(args[2:], defaults))
            filtered_kwargs = {
                kw: kwargs[kw]
                for kw in kwargs if kw in defaults
                and not isinstance(kwargs[kw], type(defaults[kw]))
                or kwargs[kw] != defaults[kw]
            }
        # This happens on Python 2; we just default to serializing the entire dictionary.
        except AttributeError:
            filtered_kwargs = kwargs

        if isinstance(filepath_or_buffer, str):
            if not os.path.exists(filepath_or_buffer):
                ErrorMessage.default_to_pandas("File not found on disk")
                return cls._read_csv_from_pandas(filepath_or_buffer,
                                                 filtered_kwargs)
        elif not isinstance(filepath_or_buffer, py.path.local):
            read_from_pandas = True
            # Pandas read_csv supports pathlib.Path
            try:
                import pathlib

                if isinstance(filepath_or_buffer, pathlib.Path):
                    read_from_pandas = False
            except ImportError:
                pass
            if read_from_pandas:
                ErrorMessage.default_to_pandas("Reading from buffer.")
                return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)
        if (_infer_compression(filepath_or_buffer, kwargs.get("compression"))
                is not None):
            ErrorMessage.default_to_pandas("Compression detected.")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)

        chunksize = kwargs.get("chunksize")
        if chunksize is not None:
            ErrorMessage.default_to_pandas("Reading chunks from a file.")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)

        skiprows = kwargs.get("skiprows")
        if skiprows is not None and not isinstance(skiprows, int):
            ErrorMessage.default_to_pandas(
                "skiprows parameter not optimized yet.")
            return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)
        # TODO: replace this by reading lines from file.
        if kwargs.get("nrows") is not None:
            ErrorMessage.default_to_pandas("`read_csv` with `nrows`")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)
        else:
            return cls._read_csv_from_file_pandas_on_ray(
                filepath_or_buffer, filtered_kwargs)
Example #22
    def __init__(self,
                 obj,
                 path_or_buf=None,
                 sep=",",
                 na_rep='',
                 float_format=None,
                 cols=None,
                 header=True,
                 index=True,
                 index_label=None,
                 mode='w',
                 encoding=None,
                 compression='infer',
                 quoting=None,
                 line_terminator='\n',
                 chunksize=None,
                 quotechar='"',
                 date_format=None,
                 doublequote=True,
                 escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf, _, _, _ = get_filepath_or_buffer(
            path_or_buf, encoding=encoding, compression=compression, mode=mode)
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        if encoding is None:
            encoding = 'utf-8'
        self.encoding = encoding
        self.compression = _infer_compression(self.path_or_buf, compression)

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator or os.linesep

        self.date_format = date_format

        self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        if cols is not None:
            if isinstance(cols, ABCIndexClass):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex))
                and date_format is not None):
            from pandas import Index
            self.data_index = Index([
                x.strftime(date_format) if notna(x) else ''
                for x in self.data_index
            ])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0