Examples
--------
>>> import cudf
>>> df = cudf.read_avro(filename)
>>> df
  num1                datetime text
0  123 2018-11-13T12:00:00.000 5451
1  456 2018-11-14T12:35:01.000 5784
2  789 2018-11-15T18:02:59.000 6117

See Also
--------
cudf.io.csv.read_csv
cudf.io.json.read_json
""".format(remote_data_sources=_docstring_remote_sources)
doc_read_avro = docfmt_partial(docstring=_docstring_read_avro)
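
# A minimal sketch (assumed usage; the actual consumer lives in the Avro
# reader module, not in this file) of how the partial above injects the
# docstring into the public function:
#
#     @doc_read_avro()
#     def read_avro(filepath_or_buffer, ...):
#         """{docstring}"""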

_docstring_read_parquet_metadata = """
Read a Parquet file's metadata and schema

Parameters
----------
path : string or path object
    Path of file to be read

Returns
-------
num_rows : int
    Total number of rows.
num_row_groups : int
    Number of row groups.
names : list of str
    List of column names.

Examples
--------
>>> import cudf
>>> num_rows, num_row_groups, names = cudf.read_parquet_metadata(filename)
>>> df = [cudf.read_parquet(filename, row_group=i) for i in range(num_row_groups)]
>>> df = cudf.concat(df)
>>> df
  num1                datetime text
0  123 2018-11-13T12:00:00.000 5451
1  456 2018-11-14T12:35:01.000 5784
2  789 2018-11-15T18:02:59.000 6117

See Also
--------
cudf.io.parquet.read_parquet
"""
doc_read_parquet_metadata = docfmt_partial(
    docstring=_docstring_read_parquet_metadata)

_docstring_read_parquet = """
Read a Parquet file into a DataFrame

Parameters
----------
path : string or path object
    Path of file to be read
engine : {'cudf', 'pyarrow'}, default 'cudf'
    Parser engine to use.
columns : list, default None
    If not None, only these columns will be read.
row_group : int, default None
    If not None, only the row group with the specified index will be read.
skip_rows : int, default None
    If not None, the number of rows to skip from the start of the file.
    Then in user code:

        >>> gs = gd.Series(list(range(0, 50)))
        >>> gs.odd[1]
        1
        >>> gs.odd[2]
        3
        >>> gs.odd[3]
        5

"""


doc_register_dataframe_accessor = docfmt_partial(
    docstring=_docstring_register_accessor.format(
        klass="DataFrame", example=_dataframe_example
    )
)

doc_register_index_accessor = docfmt_partial(
    docstring=_docstring_register_accessor.format(
        klass="Index", example=_index_example
    )
)

doc_register_series_accessor = docfmt_partial(
    docstring=_docstring_register_accessor.format(
        klass="Series", example=_series_example
    )
)
chunks : int or Series-like
    If it is an ``int``, it is the chunksize.
    If it is an array, it contains the integer offsets that mark the start
    of each chunk.  The span of the i-th chunk is
    ``data[chunks[i] : chunks[i + 1]]`` for any ``i + 1 < chunks.size``,
    or ``data[chunks[i]:]`` when ``i == len(chunks) - 1``.
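    For example (hypothetical values), with ``chunks == [0, 3, 6]`` and
    ``len(data) == 8``, the chunk spans are ``data[0:3]``, ``data[3:6]``,
    and ``data[6:]``.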
tpb : int, optional
    The number of threads per block for the underlying kernel.
    The default uses a single thread per chunk to emulate serial
    execution; this is a good starting point but inefficient.
    Its maximum possible value is limited by the available CUDA GPU
    resources.
"""

doc_apply = docfmt_partial(params=_doc_applyparams)
doc_applychunks = docfmt_partial(params=_doc_applyparams,
                                 params_chunks=_doc_applychunkparams)


@doc_apply()
def apply_rows(df, func, incols, outcols, kwargs, cache_key):
    """Row-wise transformation

    Parameters
    ----------
    df : DataFrame
        The source dataframe.
    {params}
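
    Examples
    --------
    A minimal sketch of the expected call pattern (hypothetical column
    names; in user code this is typically reached through
    ``DataFrame.apply_rows``):

    >>> import numpy as np
    >>> import cudf
    >>> df = cudf.DataFrame()
    >>> df['a'] = [1, 2, 3]
    >>> df['b'] = [4, 5, 6]
    >>> def kernel(a, b, out):
    ...     for i, (x, y) in enumerate(zip(a, b)):
    ...         out[i] = x + y
    >>> out_df = df.apply_rows(kernel, incols=['a', 'b'],
    ...                        outcols=dict(out=np.float64),
    ...                        kwargs=dict())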

    """
0       10   hello
1       20  rapids
2       30      ai
>>> pandavro.to_avro("data.avro", pandas_df)
>>> cudf.read_avro("data.avro")
   numbers    text
0       10   hello
1       20  rapids
2       30      ai

Examples
--------
>>> import cudf
>>> df = cudf.read_parquet(filename)
>>> df
  num1                datetime text
0  123 2018-11-13T12:00:00.000 5451
1  456 2018-11-14T12:35:01.000 5784
2  789 2018-11-15T18:02:59.000 6117

See Also
--------
cudf.io.parquet.to_parquet
cudf.io.orc.read_orc
"""
doc_read_parquet = docfmt_partial(docstring=_docstring_read_parquet)

_docstring_to_parquet = """
Write a DataFrame to the parquet format.

Parameters
----------
path : str
    File path or Root Directory path. Will be used as Root Directory path
    while writing a partitioned dataset.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
    Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
    If ``True``, include the dataframe's index(es) in the file output. If
    ``False``, they will not be written to the file. If ``None``, the
    engine's default behavior will be used.