Code Example #1
File: parquet.py Project: bwignall/pandas
    def write(self, df, path, compression='snappy', index=None,
              partition_cols=None, **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if 'partition_on' in kwargs and partition_cols is not None:
            raise ValueError("Cannot use both partition_on and "
                             "partition_cols. Use partition_cols for "
                             "partitioning data")
        elif 'partition_on' in kwargs:
            partition_cols = kwargs.pop('partition_on')

        if partition_cols is not None:
            kwargs['file_scheme'] = 'hive'

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path, df, compression=compression,
                           write_index=index, partition_on=partition_cols,
                           **kwargs)
Code Example #2
File: parquet.py Project: jess010/pandas
    def read(self, path, columns=None, **kwargs):
        if is_s3_url(path):
            # When path is s3:// an S3File is returned.
            # We need to retain the original path (str) while also
            # passing the S3File().open function to the fastparquet impl.
            s3, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
        else:
            path, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)
Code Example #3
File: parquet.py Project: bkandel/pandas
    def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', index=None, **kwargs):
        self.validate_dataframe(df)

        # Only validate the index if we're writing it.
        if self._pyarrow_lt_070 and index is not False:
            self._validate_write_lt_070(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if index is None:
            from_pandas_kwargs = {}
        else:
            from_pandas_kwargs = {'preserve_index': index}

        if self._pyarrow_lt_060:
            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
                                               **from_pandas_kwargs)
            self.api.parquet.write_table(
                table, path, compression=compression, **kwargs)

        else:
            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs)
Code Example #4
File: sas7bdat.py Project: MattRijk/pandas
    def __init__(self, path_or_buf, index=None, convert_dates=True,
                 blank_missing=True, chunksize=None, encoding=None,
                 convert_text=True, convert_header_text=True):

        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = ""
        self.column_names_strings = []
        self.column_names = []
        self.column_types = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0
        self._current_row_in_file_index = 0

        self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
        if isinstance(self._path_or_buf, compat.string_types):
            self._path_or_buf = open(self._path_or_buf, 'rb')
            self.handle = self._path_or_buf

        self._get_properties()
        self._parse_metadata()
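A minimal usage sketch, assuming a recent pandas: the reader above backs the public pd.read_sas entry point. The .sas7bdat path is a placeholder, since such files are not easily produced from pandas itself.

import pandas as pd

# Read the whole SAS dataset at once (placeholder path).
df = pd.read_sas("example.sas7bdat", encoding="latin-1")

# Or fetch it in chunks by requesting a reader object instead of a DataFrame.
reader = pd.read_sas("example.sas7bdat", chunksize=10000)
chunk = reader.read()  # returns the next chunk as a DataFrame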
Code Example #5
File: packers.py Project: bwignall/pandas
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, BytesIO like or string
    encoding : Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
               (default is False)

    Returns
    -------
    obj : same type as object stored in file
    """
    path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
        if len(unpacked_obj) == 1:
            return unpacked_obj[0]

        if should_close:
            try:
                path_or_buf.close()
            except IOError:
                pass
        return unpacked_obj

    # see if we have an actual file
    if isinstance(path_or_buf, str):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    if isinstance(path_or_buf, bytes):
        # treat as a binary-like
        fh = None
        try:
            fh = BytesIO(path_or_buf)
            return read(fh)
        finally:
            if fh is not None:
                fh.close()
    elif hasattr(path_or_buf, 'read') and callable(path_or_buf.read):
        # treat as a buffer like
        return read(path_or_buf)

    raise ValueError('path_or_buf needs to be a string file path or file-like')
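A short, hedged round-trip sketch for the legacy API shown above (to_msgpack/read_msgpack were deprecated in pandas 0.25 and removed in 1.0); the file name is a placeholder.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_msgpack("tmp.msg")               # write (legacy pandas only)
restored = pd.read_msgpack("tmp.msg")  # read it back as a DataFrame

# With iterator=True the unpacker itself is returned and yields one object at a time.
for obj in pd.read_msgpack("tmp.msg", iterator=True):
    print(type(obj))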
Code Example #6
File: sas_xport.py Project: TomAugspurger/pandas
    def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
                 chunksize=None):

        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        if isinstance(filepath_or_buffer, str):
            (filepath_or_buffer, encoding,
             compression, should_close) = get_filepath_or_buffer(
                filepath_or_buffer, encoding=encoding)

        if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
            self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = filepath_or_buffer.read()
            try:
                contents = contents.encode(self._encoding)
            except UnicodeEncodeError:
                pass
            self.filepath_or_buffer = compat.BytesIO(contents)

        self._read_header()
Code Example #7
File: packers.py Project: dmjvictory/pandas
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, BytesIO like or string
    encoding: Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
               (default is False)

    Returns
    -------
    obj : type of object stored in file

    """
    path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        l = list(unpack(fh, encoding=encoding, **kwargs))
        if len(l) == 1:
            return l[0]
        return l

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    if isinstance(path_or_buf, compat.binary_type):
        # treat as a binary-like
        fh = None
        try:
            # We can't distinguish between a path and a buffer of bytes in
            # Python 2 so instead assume the first byte of a valid path is
            # less than 0x80.
            if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
                fh = compat.BytesIO(path_or_buf)
                return read(fh)
        finally:
            if fh is not None:
                fh.close()
    elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
        # treat as a buffer like
        return read(path_or_buf)

    raise ValueError('path_or_buf needs to be a string file path or file-like')
Code Example #8
File: test_common.py Project: TomAugspurger/pandas
 def test_get_filepath_or_buffer_with_path(self):
     filename = '~/sometest'
     filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
         filename)
     assert filepath_or_buffer != filename
     assert os.path.isabs(filepath_or_buffer)
     assert os.path.expanduser(filename) == filepath_or_buffer
     assert not should_close
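The test above covers the plain local-path case; here is a small hedged sketch of calling the helper directly, assuming a pandas version in which it returns the 4-tuple used by the test (roughly 0.24 through 1.1).

import os
from pandas.io.common import get_filepath_or_buffer

filepath_or_buffer, encoding, compression, should_close = get_filepath_or_buffer(
    '~/sometest')

print(os.path.isabs(filepath_or_buffer))  # True: '~' was expanded to an absolute path
print(should_close)                       # False: nothing was opened for a local path

if should_close:  # only True when the helper opened a handle itself
    filepath_or_buffer.close()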
Code Example #9
File: parquet.py Project: dmjvictory/pandas
 def read(self, path, columns=None, **kwargs):
     path, _, _ = get_filepath_or_buffer(path)
     if self._pyarrow_lt_070:
         return self.api.parquet.read_pandas(path, columns=columns,
                                             **kwargs).to_pandas()
     kwargs['use_pandas_metadata'] = True
     return self.api.parquet.read_table(path, columns=columns,
                                        **kwargs).to_pandas()
Code Example #10
File: parquet.py Project: Xbar/pandas
 def write(self, df, path, compression='snappy', **kwargs):
     # thriftpy/protocol/compact.py:339:
     # DeprecationWarning: tostring() is deprecated.
     # Use tobytes() instead.
     path, _, _ = get_filepath_or_buffer(path)
     with catch_warnings(record=True):
         self.api.write(path, df,
                        compression=compression, **kwargs)
Code Example #11
File: packers.py Project: tserafim/pandas
def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, BytesIO like or string
    encoding: Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
               (default is False)

    Returns
    -------
    obj : type of object stored in file

    """
    path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        l = list(unpack(fh, encoding=encoding, **kwargs))
        if len(l) == 1:
            return l[0]
        return l

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):

        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, "rb") as fh:
                return read(fh)

    # treat as a binary-like
    if isinstance(path_or_buf, compat.binary_type):
        fh = None
        try:
            fh = compat.BytesIO(path_or_buf)
            return read(fh)
        finally:
            if fh is not None:
                fh.close()

    # a buffer like
    if hasattr(path_or_buf, "read") and compat.callable(path_or_buf.read):
        return read(path_or_buf)

    raise ValueError("path_or_buf needs to be a string file path or file-like")
Code Example #12
    def write(
        self,
        df: DataFrame,
        path,
        compression="snappy",
        index=None,
        partition_cols=None,
        **kwargs,
    ):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if "partition_on" in kwargs and partition_cols is not None:
            raise ValueError("Cannot use both partition_on and "
                             "partition_cols. Use partition_cols for "
                             "partitioning data")
        elif "partition_on" in kwargs:
            partition_cols = kwargs.pop("partition_on")

        if partition_cols is not None:
            kwargs["file_scheme"] = "hive"

        if is_s3_url(path) or is_gcs_url(path):
            # if path is s3:// or gs:// we need to open the file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
            # And pass the opened file to the fastparquet internal impl.
            kwargs["open_with"] = lambda path, _: path
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(
                path,
                df,
                compression=compression,
                write_index=index,
                partition_on=partition_cols,
                **kwargs,
            )
Code Example #13
    def read(self, path, columns=None, **kwargs):
        path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs["use_pandas_metadata"] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
        if should_close:
            path.close()

        return result
Code Example #14
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None):

    # Dictionaries are no longer considered valid inputs
    # for "get_filepath_or_buffer" starting in pandas >= 0.20.0
    if isinstance(filepath_or_buffer, dict):
        return filepath_or_buffer, encoding, compression

    return com.get_filepath_or_buffer(
        filepath_or_buffer, encoding=encoding, compression=None
    )
Code Example #15
    def write(self, df, path, compression='snappy', **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path, df,
                           compression=compression, **kwargs)
Code Example #16
def read_feather(path,
                 columns=None,
                 use_threads: bool = True,
                 storage_options: StorageOptions = None):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.feather``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : sequence, default None
        If not provided, all columns are read.

        .. versionadded:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

       .. versionadded:: 0.24.0
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values.

        .. versionadded:: 1.2.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    ioargs = get_filepath_or_buffer(path, storage_options=storage_options)

    df = feather.read_feather(ioargs.filepath_or_buffer,
                              columns=columns,
                              use_threads=bool(use_threads))

    ioargs.close()

    return df
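A brief, hedged usage sketch for the public wrapper shown above: the local file is created on the fly, while the S3 URL and its options are illustrative placeholders (storage_options requires pandas >= 1.2 plus fsspec).

import pandas as pd

pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}).to_feather("example.feather")
df = pd.read_feather("example.feather", columns=["a"])

# Remote reads route extra connection settings through storage_options:
# df = pd.read_feather("s3://my-bucket/example.feather",
#                      storage_options={"anon": True})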
Code Example #17
def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    Parameters
    ----------
    path_or_buf : string File path, BytesIO like or string
    encoding: Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
               (default is False)

    Returns
    -------
    obj : type of object stored in file

    """
    path_or_buf, *_ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        l = list(unpack(fh, encoding=encoding, **kwargs))
        if len(l) == 1:
            if isinstance(l[0], np.ndarray):
                return l[0].copy()
            return l[0]
        return l

    # see if we have an actual file
    if isinstance(path_or_buf, path_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, "rb") as fh:
                return read(fh)

    # treat as a binary-like
    if isinstance(path_or_buf, binary_type):
        fh = None
        try:
            fh = BytesIO(path_or_buf)
            return read(fh)
        finally:
            if fh is not None:
                fh.close()

    # a buffer like
    if hasattr(path_or_buf, "read") and callable(path_or_buf.read):
        return read(path_or_buf)

    raise ValueError("path_or_buf needs to be a string file path or file-like")
Code Example #18
    def read(self, path, columns=None, **kwargs):
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            open_with = lambda path, _: fsspec.open(path, "rb").open()
            parquet_file = self.api.ParquetFile(path, open_with=open_with)
        else:
            path, _, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)
Code Example #19
def read_msgpack(path_or_buf, iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, BytesIO like or string
    iterator : boolean, if True, return an iterator to the unpacker
               (default is False)

    Returns
    -------
    obj : type of object stored in file

    """
    _importers()
    path_or_buf, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        l = list(unpack(fh))
        if len(l) == 1:
            return l[0]
        return l

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):

        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    # treat as a string-like
    if not hasattr(path_or_buf, 'read'):

        try:
            fh = compat.BytesIO(path_or_buf)
            return read(fh)
        finally:
            fh.close()

    # a buffer like
    return read(path_or_buf)
Code Example #20
File: packers.py Project: LeadSift/pandas
def read_msgpack(path_or_buf, iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, BytesIO like or string
    iterator : boolean, if True, return an iterator to the unpacker
               (default is False)

    Returns
    -------
    obj : type of object stored in file

    """
    _importers()
    path_or_buf, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        l = list(unpack(fh))
        if len(l) == 1:
            return l[0]
        return l

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):

        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    # treat as a string-like
    if not hasattr(path_or_buf, 'read'):

        try:
            fh = compat.BytesIO(path_or_buf)
            return read(fh)
        finally:
            fh.close()

    # a buffer like
    return read(path_or_buf)
Code Example #21
File: sas7bdat.py Project: Iswaqasahmed/pandas-1
    def __init__(
        self,
        path_or_buf,
        index=None,
        convert_dates=True,
        blank_missing=True,
        chunksize=None,
        encoding=None,
        convert_text=True,
        convert_header_text=True,
    ):

        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = b""
        self.column_names_strings = []
        self.column_names = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._column_types = []

        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0
        self._current_row_in_file_index = 0

        path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
        if isinstance(path_or_buf, str):
            buf = open(path_or_buf, "rb")
            self.handle = buf
        else:
            buf = path_or_buf

        self._path_or_buf: IO[Any] = buf

        try:
            self._get_properties()
            self._parse_metadata()
        except Exception:
            self.close()
            raise
Code Example #22
File: parquet.py Project: kljp/MLPrac01
    def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        if self._pyarrow_lt_060:
            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
            self.api.parquet.write_table(
                table, path, compression=compression, **kwargs)

        else:
            table = self.api.Table.from_pandas(df)
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs)
Code Example #23
    def write(self,
              df,
              path,
              compression='snappy',
              index=None,
              partition_cols=None,
              **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if 'partition_on' in kwargs and partition_cols is not None:
            raise ValueError("Cannot use both partition_on and "
                             "partition_cols. Use partition_cols for "
                             "partitioning data")
        elif 'partition_on' in kwargs:
            partition_cols = kwargs.pop('partition_on')

        if partition_cols is not None:
            kwargs['file_scheme'] = 'hive'

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path,
                           df,
                           compression=compression,
                           write_index=index,
                           partition_on=partition_cols,
                           **kwargs)
Code Example #24
File: parquet.py Project: bwignall/pandas
    def read(self, path, columns=None, **kwargs):
        path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
        if should_close:
            try:
                path.close()
            except:  # noqa: flake8
                pass

        return result
Code Example #25
    def read(self, path, columns=None, **kwargs):
        path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
        if should_close:
            try:
                path.close()
            except:  # noqa: flake8
                pass

        return result
Code Example #26
File: parquet.py Project: Xbar/pandas
    def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        if self._pyarrow_lt_060:
            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
            self.api.parquet.write_table(
                table, path, compression=compression, **kwargs)

        else:
            table = self.api.Table.from_pandas(df)
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs)
Code Example #27
File: json.py Project: AjayRamanathan/pandas
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True,
              parse_dates=False, keep_default_dates=True):
    """
    Convert JSON string to pandas object

    Parameters
    ----------
    filepath_or_buffer : a VALID JSON string or file handle / StringIO. The string could be
        a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host
        is expected. For instance, a local file could be
        file://localhost/path/to/table.json
    orient : {'split', 'records', 'index'}, default 'index'
        The format of the JSON string
        split : dict like
            {index -> [index], name -> name, data -> [values]}
        records : list like [value, ... , value]
        index : dict like {index -> value}
    typ : type of object to recover (series or frame), default 'frame'
    dtype : dtype of the resulting object
    numpy: direct decoding to numpy arrays. default True but falls back
        to standard decoding if a problem occurs.
    parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns
        default is False
    keep_default_dates : boolean, default True. If parsing dates,
        then parse the default datelike columns

    Returns
    -------
    result : Series or DataFrame
    """

    filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(filepath_or_buffer, basestring):
        if os.path.exists(filepath_or_buffer):
            with open(filepath_or_buffer, 'r') as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse()

    if typ == 'series' or obj is None:
        obj = SeriesParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse()

    return obj
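The signature above is from a very old pandas; as a hedged sketch, an equivalent call in any recent pandas looks like this (the JSON text is wrapped in StringIO to avoid the now-deprecated literal-string input).

import pandas as pd
from io import StringIO

json_text = '{"a": {"0": 1, "1": 2}, "b": {"0": 3, "1": 4}}'
df = pd.read_json(StringIO(json_text), orient='columns')
print(df)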
Code Example #28
    def from_json(cls, path_or_buf=None):
        """ Returns an ACN-Sim object loaded from in_registry.
        Note URLs have not been tested as path_or_buf input.

        Args:
            path_or_buf (Union[str, FilePathOrBuffer]): a valid JSON
                str, path object or file-like object. Any valid string
                path is acceptable.
        """
        # The code here is from pandas 1.0.1, io.json.from_json(), with
        # modifications.
        filepath_or_buffer, _, _, should_close = get_filepath_or_buffer(
            path_or_buf)

        exists = False
        if isinstance(filepath_or_buffer, str):
            try:
                exists = os.path.exists(filepath_or_buffer)
            except (TypeError, ValueError):
                pass

        if exists:
            filepath_or_buffer, _ = get_handle(filepath_or_buffer, "r")
            should_close = True

        if isinstance(filepath_or_buffer, str):
            should_close = False
            out_registry = json.loads(filepath_or_buffer)
        else:
            out_registry = json.load(filepath_or_buffer)
        if should_close:
            filepath_or_buffer.close()

        if out_registry["version"] is None:
            warnings.warn(
                f"Missing a recorded version of acnportal in the "
                f"loaded registry. Object may have been dumped with a "
                f"different version of acnportal.",
                UserWarning,
            )
        if out_registry["dependency_versions"] is None:
            warnings.warn(
                f"Missing recorded dependency versions of acnportal in "
                f"the loaded registry. Object may have been dumped "
                f"with different dependency versions of acnportal.",
                UserWarning,
            )

        out_obj = cls._from_registry(out_registry)[0]
        return out_obj
Code Example #29
File: parquet.py Project: ziggi0703/pandas
    def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', index=None, **kwargs):
        self.validate_dataframe(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if index is None:
            from_pandas_kwargs = {}
        else:
            from_pandas_kwargs = {'preserve_index': index}

        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Code Example #30
File: parquet.py Project: wvigiloliver/pandas
    def write(
        self,
        df: DataFrame,
        path,
        compression="snappy",
        index=None,
        partition_cols=None,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if "partition_on" in kwargs and partition_cols is not None:
            raise ValueError(
                "Cannot use both partition_on and "
                "partition_cols. Use partition_cols for partitioning data"
            )
        elif "partition_on" in kwargs:
            partition_cols = kwargs.pop("partition_on")

        if partition_cols is not None:
            kwargs["file_scheme"] = "hive"

        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
            kwargs["open_with"] = lambda path, _: fsspec.open(
                path, "wb", **(storage_options or {})
            ).open()
        else:
            if storage_options:
                raise ValueError(
                    "storage_options passed with file object or non-fsspec file path"
                )
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(
                path,
                df,
                compression=compression,
                write_index=index,
                partition_on=partition_cols,
                **kwargs,
            )
Code Example #31
File: _cli.py Project: nvictus/bow
def par2txt(path, sep, header, index):
    """
    Convert Parquet to CSV text.

    """
    path, _, _, should_close = get_filepath_or_buffer(path)

    f = IterableParquetFile(path)
    n = 0
    for chunk in f:
        print(chunk.to_csv(sep=sep, index=index, header=header))
        n += len(chunk)

    if should_close:
        path.close()
Code Example #32
File: _cli.py Project: nvictus/bow
def info(path, schema):
    """
    Print Parquet file metadata.

    """
    path, _, _, should_close = get_filepath_or_buffer(path)

    f = IterableParquetFile(path)
    if schema:
        print(format_schema(f.schema))
    else:
        print(format_metadata(f.metadata))

    if should_close:
        path.close()
Code Example #33
File: _xlrd.py Project: Itay4/pandas
    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.
        """
        err_msg = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(err_msg)
        else:
            if xlrd.__VERSION__ < LooseVersion("1.0.0"):
                raise ImportError(err_msg +
                                  ". Current version " + xlrd.__VERSION__)

        from pandas.io.excel._base import ExcelFile
        # If filepath_or_buffer is a url, want to keep the data as bytes so
        # can't pass to get_filepath_or_buffer()
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = _urlopen(filepath_or_buffer)
        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, xlrd.Book):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            data = filepath_or_buffer.read()
            self.book = xlrd.open_workbook(file_contents=data)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = xlrd.open_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
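A hedged usage sketch: this reader class is normally reached through the public pd.read_excel entry point by selecting the xlrd engine; the workbook path is a placeholder (current xlrd releases only handle legacy .xls files).

import pandas as pd

df = pd.read_excel("example.xls", engine="xlrd")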
Code Example #34
    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.
        """
        err_msg = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(err_msg)
        else:
            if xlrd.__VERSION__ < LooseVersion("1.0.0"):
                raise ImportError(err_msg +
                                  ". Current version " + xlrd.__VERSION__)

        from pandas.io.excel._base import ExcelFile
        # If filepath_or_buffer is a url, want to keep the data as bytes so
        # can't pass to get_filepath_or_buffer()
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = urlopen(filepath_or_buffer)
        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, xlrd.Book):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            data = filepath_or_buffer.read()
            self.book = xlrd.open_workbook(file_contents=data)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = xlrd.open_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
Code Example #35
def open_filepath_or_buffer(f, open_flags="r", compression=None):
    """Use pandas IO functions to return a handle from a filepath
    or buffer.

    Parameters
    ----------
    f : str or buffer
        filepath or buffer to open
    open_flags : str, optional
        mode to open file
    compression : str, optional
        compression arg passed to pandas functions

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like
        A list of file-like objects opened. Seems mostly relevant for zipped archives.
    close : bool
        A flag indicating whether the caller should close the file object when done

    """
    if not pandas:
        raise Exception("Please install pandas to use this function")

    res = get_filepath_or_buffer(f, compression=compression)
    # HACK: handle multiple pandas versions
    try:
        f, _, compression, should_close = res
    except TypeError:
        f = res.filepath_or_buffer
        compression = res.compression
        should_close = res.should_close

    close = False or should_close
    if isinstance(f, str):
        close = True

    res = get_handle(f, open_flags, compression=compression)
    # HACK: handle multiple pandas versions
    try:
        f, handles = res
    except TypeError:
        f = res.handle
        handles = res.created_handles

    return f, handles, close
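A minimal sketch of how the helper above might be driven, assuming pandas is installed; a small gzipped CSV is created first and the file name is a placeholder.

import gzip

with gzip.open("data.csv.gz", "wt") as out:
    out.write("a,b\n1,2\n")

f, handles, close = open_filepath_or_buffer("data.csv.gz", open_flags="r",
                                            compression="gzip")
try:
    print(f.read())
finally:
    if close:
        f.close()
    for h in handles:  # close() is idempotent, so double-closing is harmless
        h.close()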
Code Example #36
    def from_url(cls, url):
        """
        Alternate constructor to create a GeoDataFrame from a GeoJSON file online.

        Example:
            df = geopandas.GeoDataFrame.from_url('https://raw.githubusercontent.com/geopandas/geopandas/master/examples/null_geom.geojson')

        Inspired by pandas.read_json().

        """
        raw = get_filepath_or_buffer(url)[0]
        data = raw.read()
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        geojson = json.loads(data)
        return GeoDataFrame.from_features(geojson['features'])
Code Example #37
File: stata.py Project: pombredanne/pandas
    def __init__(self, path_or_buf, encoding="cp1252"):
        super(StataReader, self).__init__(encoding)
        self.col_sizes = ()
        self._has_string_data = False
        self._missing_values = False
        self._data_read = False
        self._value_labels_read = False
        if isinstance(path_or_buf, str):
            path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding=self._default_encoding)

        if isinstance(path_or_buf, (str, compat.text_type, bytes)):
            self.path_or_buf = open(path_or_buf, "rb")
        else:
            self.path_or_buf = path_or_buf

        self._read_header()
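A hedged round-trip sketch for the public read_stata API that this reader implements; the .dta file is created here only for illustration.

import pandas as pd

pd.DataFrame({"a": [1.0, 2.0, 3.0]}).to_stata("example.dta")
df = pd.read_stata("example.dta")
print(df.head())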
Code Example #38
File: parquet.py Project: ynorouzz/pandas
    def read(self, path, columns=None, **kwargs):
        fs = get_fs_for_path(path)
        should_close = None
        # Avoid calling get_filepath_or_buffer for s3/gcs URLs, since
        # it returns an S3File which doesn't support dir reads in arrow
        if not fs:
            path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs["use_pandas_metadata"] = True
        result = self.api.parquet.read_table(
            path, columns=columns, filesystem=fs, **kwargs
        ).to_pandas()
        if should_close:
            path.close()

        return result
Code Example #39
File: parser.py Project: oreh/gseapy
def gsea_gmt_parser(gmt, min_size = 3, max_size = 5000, gene_list=None):
    """Parse gene_sets.gmt(gene set database) file. 
    
    :param gmt: the gene_sets.gmt file where loacated inside edb folder.
    :param min_size: Minimum allowed number of genes from gene set also the data set. Default: 3. 
    :param max_size: Maximum allowed number of genes from gene set also the data set. Default: 5000.
    :param gene_list: Used for filtering gene set. Only used this argument for :func:`run` method.
    :return: Return a new filtered gene set database dictionary. 

    **DO NOT** filter gene sets, when use :func:`replot`. Because ``GSEA`` Desktop have already
    do this for you.
            
    """
 
    file_or_buffer, encode, compression = get_filepath_or_buffer(gmt)
    genesets_dict = { line.rstrip("\n").split("\t")[0]:  
                      line.rstrip("\n").split("\t")[2:] 
                      for line in file_or_buffer.readlines()}    
    #filtering dict
    if sys.version_info[0] == 3 :
        genesets_filter =  {k: v for k, v in genesets_dict.items() if len(v) >= min_size and len(v) <= max_size}
    elif sys.version_info[0] == 2:
        genesets_filter =  {k: v for k, v in genesets_dict.iteritems() if len(v) >= min_size and len(v) <= max_size}
    else:
        print("System failure. Please Provide correct input files")
        sys.exit(1)    
    if gene_list is not None:
        subsets = sorted(genesets_filter.keys())             
        for subset in subsets:            
            tag_indicator = in1d(unique(gene_list), genesets_filter.get(subset), assume_unique=True)
            tag_len = sum(tag_indicator)      
            if tag_len <= min_size or tag_len >= max_size:                    
                del genesets_filter[subset]
            else:
                continue
    #some_dict = {key: value for key, value in some_dict.items() if value != value_to_remove}
    #use np.intersect1d() may be faster???    
    filsets_num = len(genesets_dict) - len(genesets_filter)
    print("{a} gene_sets have been filtered out when max_size={b} and min_size={c}".format(a=filsets_num,b=max_size,c=min_size))
    print("{} gene_sets used for further calculating".format(len(genesets_filter)))
    
    if filsets_num == len(genesets_dict):
        print("No gene sets passed throught filtering condition!!!, try new paramters again!\n" +\
              "Note: Gene names for gseapy is case sensitive." )
        sys.exit(1)
    else:
        return genesets_filter
Code Example #40
    def read(self, path, columns=None, **kwargs):
        if is_s3_url(path):
            from pandas.io.s3 import get_file_and_filesystem

            # When path is s3:// an S3File is returned.
            # We need to retain the original path (str) while also
            # passing the S3File().open function to the fastparquet impl.
            s3, filesystem = get_file_and_filesystem(path)
            try:
                parquet_file = self.api.ParquetFile(path, open_with=filesystem.open)
            finally:
                s3.close()
        else:
            path, _, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)
Code Example #41
def read_orc(
    path: FilePathOrBuffer,
    columns: Optional[List[str]] = None,
    **kwargs,
) -> "DataFrame":
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : list, default None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame
    """

    # we require a newer version of pyarrow than we support for parquet
    import pyarrow

    if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0":
        raise ImportError("pyarrow must be >= 0.13.0 for read_orc")

    import pyarrow.orc

    path, _, _, _ = get_filepath_or_buffer(path)
    orc_file = pyarrow.orc.ORCFile(path)
    result = orc_file.read(columns=columns, **kwargs).to_pandas()
    return result
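A short, hedged usage sketch, assuming pandas >= 1.0 and a pyarrow recent enough that its orc module can also write the sample file (the path is a placeholder).

import pandas as pd
import pyarrow as pa
import pyarrow.orc as orc

table = pa.Table.from_pandas(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))
orc.write_table(table, "example.orc")  # writing needs pyarrow >= 4.0

df = pd.read_orc("example.orc", columns=["a"])
print(df)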
Code Example #42
    def __init__(self, path_or_buf, encoding='cp1252'):
        super(StataReader, self).__init__(encoding)
        self.col_sizes = ()
        self._has_string_data = False
        self._missing_values = False
        self._data_read = False
        self._value_labels_read = False
        if isinstance(path_or_buf, str):
            path_or_buf, encoding = get_filepath_or_buffer(path_or_buf,
                                                           encoding='cp1252')

        if isinstance(path_or_buf, (str, compat.text_type, bytes)):
            self.path_or_buf = open(path_or_buf, 'rb')
        else:
            self.path_or_buf = path_or_buf

        self._read_header()
Code Example #43
File: __init__.py Project: caladov/pandas-datareader
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):

    # Dictionaries are no longer considered valid inputs
    # for "get_filepath_or_buffer" starting in pandas >= 0.20.0
    if isinstance(filepath_or_buffer, dict):
        return filepath_or_buffer, encoding, compression
    try:
        tmp = com._get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=None)
        return tmp.filepath_or_buffer, tmp.encoding, tmp.compression
    except AttributeError:
        tmp = com.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=None)
        return tmp
Code Example #44
File: stata.py Project: AjayRamanathan/pandas
    def __init__(self, path_or_buf, encoding=None):
        super(StataReader, self).__init__(encoding)
        self.col_sizes = ()
        self._has_string_data = False
        self._missing_values = False
        self._data_read = False
        self._value_labels_read = False
        if isinstance(path_or_buf, str):
            path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
            if encoding is not None:
                self._encoding = encoding

        if type(path_or_buf) is str:
            self.path_or_buf = open(path_or_buf, 'rb')
        else:
            self.path_or_buf = path_or_buf

        self._read_header()
Code Example #45
File: stata.py Project: mantinband/hq-trivia-bot
    def __init__(self, path_or_buf, encoding=None):
        super(StataReader, self).__init__(encoding)
        self.col_sizes = ()
        self._has_string_data = False
        self._missing_values = False
        self._data_read = False
        self._value_labels_read = False
        if isinstance(path_or_buf, str):
            path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
            if encoding is not None:
                self._encoding = encoding

        if type(path_or_buf) is str:
            self.path_or_buf = open(path_or_buf, 'rb')
        else:
            self.path_or_buf = path_or_buf

        self._read_header()
Code Example #46
def open_file(filepath_or_buffer, mode="r", encoding=None, compression="infer"):
    if encoding is not None:
        encoding = re.sub("_", "-", encoding).lower()

    compression = _infer_compression(filepath_or_buffer, compression)
    filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer, encoding, compression)

    is_path = isinstance(filepath_or_buffer, str)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                return gzip.open(filepath_or_buffer, mode)
            return gzip.GzipFile(fileobj=filepath_or_buffer)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                return bz2.BZ2File(filepath_or_buffer, mode)
            return bz2.BZ2File(filepath_or_buffer)

        # ZIP Compression
        elif compression == "zip":
            zip_file = zipfile.ZipFile(filepath_or_buffer)
            zip_names = zip_file.namelist()
            if len(zip_names) == 1:
                return zip_file.open(zip_names.pop())
            if len(zip_names) == 0:
                raise ValueError(f"Zero files found in ZIP file {filepath_or_buffer}")
            else:
                raise ValueError("Multiple files found in ZIP file."
                                 f" Only one file per ZIP: {filepath_or_buffer}")

        # XZ Compression
        elif compression == "xz":
            return lzma.LZMAFile(filepath_or_buffer, mode)

        # Unrecognized Compression
        raise ValueError(f"Unrecognized compression type: {compression}")

    elif is_path:
        return open(filepath_or_buffer, mode, encoding=encoding)
Code Example #47
    def __init__(self, filepath_or_buffer):
        # If filepath_or_buffer is a url, load the data into a BytesIO
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
        elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

        if isinstance(filepath_or_buffer, self._workbook_class):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            self.book = self.load_workbook(filepath_or_buffer)
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )
Code Example #48
File: util.py Project: racheljoyforshaw/pyopendata
def _read_content(path_or_buf):
    filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(filepath_or_buffer, 'r') as fh:
                data = fh.read()
        else:
            data = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        data = filepath_or_buffer.read()
    else:
        data = filepath_or_buffer

    return data
Code Example #49
def read_feather(path, columns=None, use_threads: bool = True):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.feather``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : sequence, default None
        If not provided, all columns are read.

        .. versionadded:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

       .. versionadded:: 0.24.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    path, _, _, should_close = get_filepath_or_buffer(path)

    df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads))

    # s3fs only validates the credentials when the file is closed.
    if should_close:
        path.close()

    return df
Code Example #50
File: csvs.py Project: x997/pandas
    def __init__(
        self,
        formatter: "DataFrameFormatter",
        path_or_buf: FilePathOrBuffer[str] = "",
        sep: str = ",",
        cols: Optional[Sequence[Label]] = None,
        index_label: Optional[IndexLabel] = None,
        mode: str = "w",
        encoding: Optional[str] = None,
        errors: str = "strict",
        compression: CompressionOptions = "infer",
        quoting: Optional[int] = None,
        line_terminator="\n",
        chunksize: Optional[int] = None,
        quotechar: Optional[str] = '"',
        date_format: Optional[str] = None,
        doublequote: bool = True,
        escapechar: Optional[str] = None,
        storage_options: StorageOptions = None,
    ):
        self.fmt = formatter

        self.obj = self.fmt.frame

        self.ioargs = get_filepath_or_buffer(
            path_or_buf,
            encoding=encoding,
            compression=compression,
            mode=mode,
            storage_options=storage_options,
        )

        self.sep = sep
        self.index_label = self._initialize_index_label(index_label)
        self.errors = errors
        self.quoting = quoting or csvlib.QUOTE_MINIMAL
        self.quotechar = self._initialize_quotechar(quotechar)
        self.doublequote = doublequote
        self.escapechar = escapechar
        self.line_terminator = line_terminator or os.linesep
        self.date_format = date_format
        self.cols = self._initialize_columns(cols)
        self.chunksize = self._initialize_chunksize(chunksize)
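CSVFormatter is not called directly; as a hedged sketch, the same options arrive through DataFrame.to_csv (argument names as in pandas 1.x, where line_terminator had not yet been renamed).

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df.to_csv("out.csv.gz", sep=";", compression="infer", line_terminator="\r\n")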
Code Example #51
File: parquet.py Project: bwignall/pandas
    def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', index=None, partition_cols=None,
              **kwargs):
        self.validate_dataframe(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if index is None:
            from_pandas_kwargs = {}
        else:
            from_pandas_kwargs = {'preserve_index': index}
        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        if partition_cols is not None:
            self.api.parquet.write_to_dataset(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps,
                partition_cols=partition_cols, **kwargs)
        else:
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs)
Code Example #52
File: _base.py Project: pydata/pandas
    def __init__(self, filepath_or_buffer):
        # If filepath_or_buffer is a url, load the data into a BytesIO
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
        elif not isinstance(filepath_or_buffer,
                            (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, self._workbook_class):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            self.book = self.load_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
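A hedged usage sketch of the public entry point that reaches this constructor; the file name is an assumption and a reader engine such as xlrd or openpyxl must be installed.

import pandas as pd

# A path string takes the load_workbook(str) branch above ...
df = pd.read_excel("report.xlsx", sheet_name=0)

# ... while an already-open binary buffer takes the hasattr(..., 'read') branch.
with open("report.xlsx", "rb") as fh:
    df = pd.read_excel(fh, sheet_name=0)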
Code example #53
def _read_content(path_or_buf):
    """ copied part of internal logic from pandas.io.read_json """
    results = get_filepath_or_buffer(path_or_buf)
    # results length is 3 in pandas 0.17 or later, 2 in 0.16.2 or prior
    filepath_or_buffer = results[0]
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(filepath_or_buffer, 'r') as fh:
                data = fh.read()
        else:
            data = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        data = filepath_or_buffer.read()
    else:
        data = filepath_or_buffer

    return data
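A minimal sketch of the version-agnostic unpacking the comment above relies on: only the first element of the returned tuple is needed, so the 2-, 3- or 4-tuple variants can all be handled the same way. The import path and the should_close flag are assumptions about older pandas; the function is not part of recent releases.

from pandas.io.common import get_filepath_or_buffer  # older pandas only

results = get_filepath_or_buffer("data.json")  # assumed local path
filepath_or_buffer = results[0]                # valid for any tuple length
should_close = results[3] if len(results) > 3 else False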
Code example #54
File: test_common.py Project: TomAugspurger/pandas
    def test_get_filepath_or_buffer_with_buffer(self):
        input_buffer = StringIO()
        filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
            input_buffer)
        assert filepath_or_buffer == input_buffer
        assert not should_close
Code example #55
File: json.py Project: BrenBarn/pandas
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient : string

        * `Series`

          - default is ``'index'``
          - allowed values are: ``{'split','records','index'}``
          - The Series index must be unique for orient ``'index'``.

        * `DataFrame`

          - default is ``'columns'``
          - allowed values are: {'split','records','index','columns','values'}
          - The DataFrame index must be unique for orients 'index' and
            'columns'.
          - The DataFrame columns must be unique for orients 'index',
            'columns', and 'records'.

        * The format of the JSON string

          - split : dict like
            ``{index -> [index], columns -> [columns], data -> [values]}``
          - records : list like
            ``[{column -> value}, ... , {column -> value}]``
          - index : dict like ``{index -> {column -> value}}``
          - columns : dict like ``{column -> {index -> value}}``
          - values : just the values array

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.

    Returns
    -------
    result : Series or DataFrame
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(filepath_or_buffer, 'r') as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
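A hedged sketch of the typ and dtype handling described in the docstring; passing a raw JSON string directly reflects the old pandas behaviour this code targets (newer versions prefer wrapping it in StringIO), and the literals are illustrative.

import pandas as pd

# Default typ='frame', orient='columns'; a dtype dict applies per column.
df = pd.read_json('{"col 1":{"0":1,"1":2},"col 2":{"0":3,"1":4}}',
                  dtype={"col 1": "float64"})

# typ='series' defaults to orient='index'.
ser = pd.read_json('{"a": 1, "b": 2}', typ="series")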
Code example #56
File: parquet.py Project: Xbar/pandas
    def read(self, path, columns=None, **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        return self.api.parquet.read_table(path, columns=columns,
                                           **kwargs).to_pandas()
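The matching public call, as a hedged sketch; the file name and column list are assumptions and pyarrow must be installed.

import pandas as pd

# Column pruning is pushed down to the pyarrow read_table call shown above.
df = pd.read_parquet("data.parquet", engine="pyarrow", columns=["a", "b"])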
Code example #57
File: test_common.py Project: AkiraKane/pandas
    def test_get_filepath_or_buffer_with_buffer(self):
        input_buffer = StringIO()
        filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
        self.assertEqual(filepath_or_buffer, input_buffer)
Code example #58
File: test_common.py Project: AkiraKane/pandas
    def test_get_filepath_or_buffer_with_path(self):
        filename = '~/sometest'
        filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
        self.assertNotEqual(filepath_or_buffer, filename)
        self.assertTrue(isabs(filepath_or_buffer))
        self.assertEqual(os.path.expanduser(filename), filepath_or_buffer)
Code example #59
File: json.py Project: AlexisMignon/pandas
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient : string
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            fh, handles = _get_handle(filepath_or_buffer, 'r',
                                      encoding=encoding)
            json = fh.read()
            fh.close()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        lines = list(StringIO(json.strip()))
        json = u'[' + u','.join(lines) + u']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
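A small hedged sketch of the lines=True branch above: each input line is a separate JSON object, and the reader joins them into one JSON array before parsing (the sample data is made up).

import pandas as pd

jsonl = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n'
df = pd.read_json(jsonl, lines=True)
print(df)
#    a  b
# 0  1  2
# 1  3  4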