Example #1
 def __init__(self, fn, verify=False, open_with=default_open,
              root=False, sep=None):
     if isinstance(fn, (tuple, list)):
         basepath, fmd = metadata_from_many(fn, verify_schema=verify,
                                            open_with=open_with, root=root)
         if basepath:
             self.fn = join_path(basepath, '_metadata')  # effective file
         else:
             self.fn = '_metadata'
         self.fmd = fmd
         self._set_attrs()
     elif hasattr(fn, 'read'):
         # file-like
         self._parse_header(fn, verify)
         if self.file_scheme not in ['simple', 'empty']:
             raise ValueError('Cannot use file-like input '
                              'with multi-file data')
         open_with = lambda *args, **kwargs: fn
         self.fn = None
     else:
         try:
             fn2 = join_path(fn, '_metadata')
             self.fn = fn2
             with open_with(fn2, 'rb') as f:
                 self._parse_header(f, verify)
             fn = fn2
         except (IOError, OSError):
             self.fn = join_path(fn)
             with open_with(fn, 'rb') as f:
                 self._parse_header(f, verify)
     self.open = open_with
     self.sep = sep
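
The constructor above is normally reached through the public ParquetFile class. A minimal usage sketch, assuming a local single-file data-set (the file name and DataFrame are illustrative, not taken from the examples above):

import pandas as pd
from fastparquet import ParquetFile, write

# write a small single-file data-set, then open it again
df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
write('mydata.parquet', df)          # illustrative path

pf = ParquetFile('mydata.parquet')   # simple file scheme, no _metadata needed
print(pf.columns, pf.count)          # ['x', 'y'] and the total row count, 3
out = pf.to_pandas()                 # read back as a pandas DataFrame
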
Example #2
def test_parents():
    assert join_path("test", "../../..") == "../.."

    with pytest.raises(Exception):
        join_path("/test", "../../..")
    with pytest.raises(Exception):
        join_path("/test", "../..")
Example #3
 def __init__(self,
              fn,
              verify=False,
              open_with=default_open,
              root=False,
              sep=None):
     if isinstance(fn, (tuple, list)):
         basepath, fmd = metadata_from_many(fn,
                                            verify_schema=verify,
                                            open_with=open_with,
                                            root=root)
         if basepath:
             self.fn = join_path(basepath, '_metadata')  # effective file
         else:
             self.fn = '_metadata'
         self.fmd = fmd
         self._set_attrs()
     else:
         try:
             fn2 = join_path(fn, '_metadata')
             self.fn = fn2
             with open_with(fn2, 'rb') as f:
                 self._parse_header(f, verify)
             fn = fn2
         except (IOError, OSError):
             self.fn = join_path(fn)
             with open_with(fn, 'rb') as f:
                 self._parse_header(f, verify)
     self.open = open_with
     self.sep = sep
Example #4
def test_attributes(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    pf = ParquetFile(fn)
    assert pf.columns == ['x', 'y', 'z']
    assert len(pf.row_groups) == 2
    assert pf.count == 4
    assert join_path(fn) == pf.info['name']
    assert join_path(fn) in str(pf)
    for col in df:
        assert pf.dtypes[col] == df.dtypes[col]
Example #5
def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result
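
The benchmark above exercises the object_encoding and fixed_text options of write. A minimal sketch of the same options outside the timing harness (the file name and fixed length are illustrative):

import pandas as pd
from fastparquet import ParquetFile, write

df = pd.DataFrame({'a': ['hi', 'you', 'people']})

# encode the object column as utf8 and store it as 6-byte fixed-length values
write('text.parq', df, object_encoding='utf8', fixed_text={'a': 6},
      has_nulls=False, write_index=False)

print(ParquetFile('text.parq').to_pandas())
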
Example #6
def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a':
            np.random.choice(['hi', 'you', 'people'], size=n),
            'b':
            np.random.choice([b'hi', b'you', b'people'], size=n)
        })

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn,
                          df,
                          has_nulls=False,
                          write_index=False,
                          fixed_text={col: fixed},
                          object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result
Example #7
def partition_on_columns(data,
                         columns,
                         root_path,
                         partname,
                         fmd,
                         compression,
                         open_with,
                         mkdirs,
                         with_field=True):
    """
    Split each row-group by the given columns

    Each combination of column values (determined by pandas groupby) will
    be written in structured directories.
    """
    gb = data.groupby(columns)
    remaining = list(data)
    for column in columns:
        remaining.remove(column)
    if not remaining:
        raise ValueError("Cannot include all columns in partition_on")
    rgs = []
    for key, group in zip(sorted(gb.indices), sorted(gb)):
        if group[1].empty:
            continue
        df = group[1][remaining]
        if not isinstance(key, tuple):
            key = (key, )
        if with_field:
            path = join_path(*("%s=%s" % (name, val)
                               for name, val in zip(columns, key)))
        else:
            path = join_path(*("%s" % val for val in key))
        relname = join_path(path, partname)
        mkdirs(join_path(root_path, path))
        fullname = join_path(root_path, path, partname)
        with open_with(fullname, 'wb') as f2:
            rg = make_part_file(f2,
                                df,
                                fmd.schema,
                                compression=compression,
                                fmd=fmd)
        if rg is not None:
            for chunk in rg.columns:
                chunk.file_path = relname
            rgs.append(rg)
    return rgs
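
partition_on_columns is normally invoked through write(..., partition_on=...) with a directory-based file scheme; each distinct value of the partition columns becomes a name=value directory. A sketch of that public entry point (the directory name and data are illustrative):

import pandas as pd
from fastparquet import ParquetFile, write

df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa', 'bb', 'aa', 'bb']})
write('partitioned_data', df, file_scheme='hive', partition_on=['y'])

pf = ParquetFile('partitioned_data')
print(pf.cats)          # partition values recovered from the y=aa / y=bb paths
print(pf.to_pandas())   # 'y' is reconstructed from the directory names
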
Example #8
 def row_group_filename(self, rg):
     if rg.columns and rg.columns[0].file_path:
         base = self.fn.replace('_metadata', '').rstrip('/')
         if base:
             return join_path(base, rg.columns[0].file_path)
         else:
             return rg.columns[0].file_path
     else:
         return self.fn
Example #9
 def row_group_filename(self, rg):
     if rg.columns and rg.columns[0].file_path:
         base = re.sub(r'_metadata(/)?$', '', self.fn).rstrip('/')
         if base:
             return join_path(base, rg.columns[0].file_path)
         else:
             return rg.columns[0].file_path
     else:
         return self.fn
Example #10
def test_abs_and_rel_paths():
    assert join_path('/', 'this/is/a/test/') == '/this/is/a/test'
    assert join_path('.', 'this/is/a/test/') == 'this/is/a/test'
    assert join_path('', 'this/is/a/test/') == 'this/is/a/test'
    assert join_path('/test', '.') == '/test'
    assert join_path('/test', '..', 'this') == '/this'
    assert join_path('/test', '../this') == '/this'
Example #11
def partition_on_columns(data, columns, root_path, partname, fmd,
                         compression, open_with, mkdirs, with_field=True):
    """
    Split each row-group by the given columns

    Each combination of column values (determined by pandas groupby) will
    be written in structured directories.
    """
    gb = data.groupby(columns)
    remaining = list(data)
    for column in columns:
        remaining.remove(column)
    if not remaining:
        raise ValueError("Cannot include all columns in partition_on")
    rgs = []
    for key, group in zip(sorted(gb.indices), sorted(gb)):
        if group[1].empty:
            continue
        df = group[1][remaining]
        if not isinstance(key, tuple):
            key = (key,)
        if with_field:
            path = join_path(*(
                "%s=%s" % (name, val)
                for name, val in zip(columns, key)
            ))
        else:
            path = join_path(*("%s" % val for val in key))
        relname = join_path(path, partname)
        mkdirs(join_path(root_path, path))
        fullname = join_path(root_path, path, partname)
        with open_with(fullname, 'wb') as f2:
            rg = make_part_file(f2, df, fmd.schema,
                                compression=compression, fmd=fmd)
        if rg is not None:
            for chunk in rg.columns:
                chunk.file_path = relname
            rgs.append(rg)
    return rgs
Example #12
def merge(file_list, verify_schema=True, open_with=default_open, root=False):
    """
    Create a logical data-set out of multiple parquet files.

    The files referenced in file_list must either be in the same directory,
    or at the same level within a structured directory, where the directories
    give partitioning information. The schemas of the files should also be
    consistent.

    Parameters
    ----------
    file_list: list of paths or ParquetFile instances
    verify_schema: bool (True)
        If True, will first check that all the schemas in the input files are
        identical.
    open_with: func
        Used for opening a file for writing as f(path, mode). If the input
        list contains ParquetFile instances, this is inferred from the first
        of them.
    root: str
        If passing a list of files, the top directory of the data-set may
        be ambiguous for partitioning where the uppermost field has only one
        value. Use this to specify the data-set root directory, if required.

    Returns
    -------
    ParquetFile instance corresponding to the merged data.
    """
    basepath, fmd = metadata_from_many(file_list,
                                       verify_schema,
                                       open_with,
                                       root=root)

    out_file = join_path(basepath, '_metadata')
    write_common_metadata(out_file, fmd, open_with, no_row_groups=False)
    out = api.ParquetFile(out_file, open_with=open_with)

    out_file = join_path(basepath, '_common_metadata')
    write_common_metadata(out_file, fmd, open_with)
    return out
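
A usage sketch for merge, assuming two compatible files written into the same directory beforehand (the paths are illustrative):

import os
import pandas as pd
from fastparquet import write
from fastparquet.writer import merge

os.makedirs('data', exist_ok=True)
write('data/part.0.parquet', pd.DataFrame({'x': [1, 2]}))
write('data/part.1.parquet', pd.DataFrame({'x': [3, 4]}))

# writes data/_metadata and data/_common_metadata and returns a ParquetFile
pf = merge(['data/part.0.parquet', 'data/part.1.parquet'])
print(pf.count)  # 4
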
Example #13
def merge(file_list, verify_schema=True, open_with=default_open,
          root=False):
    """
    Create a logical data-set out of multiple parquet files.

    The files referenced in file_list must either be in the same directory,
    or at the same level within a structured directory, where the directories
    give partitioning information. The schemas of the files should also be
    consistent.

    Parameters
    ----------
    file_list: list of paths or ParquetFile instances
    verify_schema: bool (True)
        If True, will first check that all the schemas in the input files are
        identical.
    open_with: func
        Used for opening a file for writing as f(path, mode). If the input
        list contains ParquetFile instances, this is inferred from the first
        of them.
    root: str
        If passing a list of files, the top directory of the data-set may
        be ambiguous for partitioning where the uppermost field has only one
        value. Use this to specify the data-set root directory, if required.

    Returns
    -------
    ParquetFile instance corresponding to the merged data.
    """
    basepath, fmd = metadata_from_many(file_list, verify_schema, open_with,
                                       root=root)

    out_file = join_path(basepath, '_metadata')
    write_common_metadata(out_file, fmd, open_with, no_row_groups=False)
    out = api.ParquetFile(out_file, open_with=open_with)

    out_file = join_path(basepath, '_common_metadata')
    write_common_metadata(out_file, fmd, open_with)
    return out
Example #14
def test_single_upper_directory(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, df, file_scheme='hive', partition_on='y')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*/*'))))
    pf = ParquetFile(flist, root=tempdir)
    assert pf.fn == join_path(os.path.join(tempdir, '_metadata'))
    out = pf.to_pandas()
    assert (out.y == 'aa').all()
Example #15
def _read_fp_multifile(fs,
                       fs_token,
                       paths,
                       columns=None,
                       categories=None,
                       index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path

    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (
        meta,
        _,
        index_name,
        out_type,
        all_columns,
        index_names,
        storage_name_mapping,
    ) = _pf_validation(pf, columns, index, categories, [])
    name = "read-parquet-" + tokenize(fs_token, paths, all_columns, categories)
    dsk = {(name, i): (
        _read_pf_simple,
        fs,
        path,
        base,
        index_names,
        all_columns,
        out_type == Series,
        categories,
        pf.cats,
        pf.file_scheme,
        storage_name_mapping,
    )
           for i, path in enumerate(parsed_paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
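
This helper is internal to dask's fastparquet engine; user code normally reaches it through dask.dataframe.read_parquet, in dask versions that still ship that engine. A sketch (the path is illustrative):

import dask.dataframe as dd

# read a directory of parquet files through the fastparquet engine
ddf = dd.read_parquet('data/', engine='fastparquet')
print(ddf.head())
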
Example #16
def _read_fp_multifile(fs, fs_token, paths, columns=None,
                       categories=None, index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path
    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(
        pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns,
                                      categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base,
                       index_names, all_columns, out_type == Series,
                       categories, pf.cats,
                       pf.file_scheme, storage_name_mapping)
           for i, path in enumerate(parsed_paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
Example #17
 def __init__(self,
              fn,
              verify=False,
              open_with=default_open,
              root=False,
              sep=None,
              fs=None,
              pandas_nulls=True):
     self.pandas_nulls = pandas_nulls
     if open_with is default_open and fs is None:
         fs = fsspec.filesystem("file")
     elif fs is not None:
         open_with = fs.open
     else:
         fs = getattr(open_with, "__self__", None)
     if isinstance(fn, (tuple, list)):
         basepath, fmd = metadata_from_many(fn,
                                            verify_schema=verify,
                                            open_with=open_with,
                                            root=root,
                                            fs=fs)
         if basepath:
             self.fn = join_path(basepath, '_metadata')  # effective file
         else:
             self.fn = '_metadata'
         self.fmd = fmd
         self._set_attrs()
     elif hasattr(fn, 'read'):
         # file-like
         self.fn = None
         self._parse_header(fn, verify)
         if self.file_scheme not in ['simple', 'empty']:
             raise ValueError('Cannot use file-like input '
                              'with multi-file data')
         open_with = lambda *args, **kwargs: fn
     else:
         if fs is not None:
             fn = fs._strip_protocol(fn)
         if not isinstance(fs, fsspec.AbstractFileSystem):
             raise ValueError(
                 "Opening directories without a _metadata requires"
                 "a filesystem compatible with fsspec")
         if fs.isfile(fn):
             self.fn = join_path(fn)
             with open_with(fn, 'rb') as f:
                 self._parse_header(f, verify)
         elif "*" in fn or fs.isdir(fn):
             fn2 = join_path(fn, '_metadata')
             if fs.exists(fn2):
                 self.fn = fn2
                 with open_with(fn2, 'rb') as f:
                     self._parse_header(f, verify)
                 fn = fn2
             else:
                 # TODO: get details from fs here, rather than do suffix cat in
                 #  metadata_from_many
                 if "*" in fn:
                     allfiles = fs.glob(fn)
                 else:
                     allfiles = [
                         f for f in fs.find(fn)
                         if f.endswith(".parquet") or f.endswith(".parq")
                     ]
                 if not allfiles:
                     raise ValueError("No files in dir")
                 basepath, fmd = metadata_from_many(allfiles,
                                                    verify_schema=verify,
                                                    open_with=open_with,
                                                    root=root,
                                                    fs=fs)
                 if basepath:
                     self.fn = join_path(basepath,
                                         '_metadata')  # effective file
                 else:
                     self.fn = '_metadata'
                 self.fmd = fmd
                 self._set_attrs()
         else:
             raise FileNotFoundError
     self.open = open_with
     self.sep = sep
     self._statistics = None
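
This newer constructor resolves paths through fsspec, so a directory, a glob pattern, or an explicit fsspec filesystem can be passed directly. A sketch against the local filesystem (the paths are illustrative):

import fsspec
from fastparquet import ParquetFile

# a directory without _metadata: the footers of the .parquet files are combined
pf = ParquetFile('data/')

# the same thing with an explicit glob pattern
pf = ParquetFile('data/*.parquet')

# or pass an fsspec filesystem; open_with is then taken from fs.open
fs = fsspec.filesystem('file')
pf = ParquetFile('data/', fs=fs)
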
Example #18
def write(filename, data, row_group_offsets=50000000,
          compression=None, file_scheme='simple', open_with=default_open,
          mkdirs=default_mkdirs, has_nulls=True, write_index=None,
          partition_on=[], fixed_text=None, append=False,
          object_encoding='infer', times='int64'):
    """ Write Pandas DataFrame to filename as Parquet Format

    Parameters
    ----------
    filename: string
        Parquet collection to write to, either a single file (if file_scheme
        is simple) or a directory containing the metadata and data-files.
    data: pandas dataframe
        The table to write
    row_group_offsets: int or list of ints
        If int, row-groups will be approximately this many rows, rounded down
        to make row groups about the same size; if a list, the explicit index
        values to start new row groups.
    compression: str, dict
        compression to apply to each column, e.g. ``GZIP`` or ``SNAPPY`` or a
        ``dict`` like ``{"col1": "SNAPPY", "col2": None}`` to specify per
        column compression types.
        In both cases, the compressor settings would be the underlying
        compressor defaults. To pass arguments to the underlying compressor,
        each ``dict`` entry should itself be a dictionary::

            {
                col1: {
                    "type": "LZ4",
                    "args": {
                        "compression_level": 6,
                        "content_checksum": True
                     }
                },
                col2: {
                    "type": "SNAPPY",
                    "args": None
                },
                "_default": {
                    "type": "GZIP",
                    "args": None
                }
            }

        where ``"type"`` specifies the compression type to use, and ``"args"``
        specifies a ``dict`` that will be turned into keyword arguments for
        the compressor.
        If the dictionary contains a "_default" entry, this will be used for any
        columns not explicitly specified in the dictionary.
    file_scheme: 'simple'|'hive'
        If simple: all goes in a single file
        If hive: each row group is in a separate file, and a separate file
        (called "_metadata") contains the metadata.
    open_with: function
        When called with a f(path, mode), returns an open file-like object
    mkdirs: function
        When called with a path/URL, creates any necessary directories to
        make that location writable, e.g., ``os.makedirs``. This is not
        necessary if using the simple file scheme.
    has_nulls: bool, 'infer' or list of strings
        Whether columns can have nulls. If a list of strings, those given
        columns will be marked as "optional" in the metadata, and include
        null definition blocks on disk. Some data types (floats and times)
        can instead use the sentinel values NaN and NaT, which are not the same
        as NULL in parquet, but functionally act the same in many cases,
        particularly if converting back to pandas later. A value of 'infer'
        will assume nulls for object columns and not otherwise.
    write_index: boolean
        Whether or not to write the index to a separate column.  By default we
        write the index *if* it is not 0, 1, ..., n.
    partition_on: list of column names
        Passed to groupby in order to split data within each row-group,
        producing a structured directory tree. Note: as with pandas, null
        values will be dropped. Ignored if file_scheme is simple.
    fixed_text: {column: int length} or None
        For bytes or str columns, values will be converted
        to fixed-length strings of the given length for the given columns
        before writing, potentially providing a large speed
        boost. The length applies to the binary representation *after*
        conversion for utf8, json or bson.
    append: bool (False)
        If False, construct data-set from scratch; if True, add new row-group(s)
        to existing data-set. In the latter case, the data-set must exist,
        and the schema must match the input data.
    object_encoding: str or {col: type}
        For object columns, this gives the data type, so that the values can
        be encoded to bytes. Possible values are bytes|utf8|json|bson|bool|int|int32,
        where bytes is assumed if not specified (i.e., no conversion). The
        special value 'infer' will cause the type to be guessed from the first
        ten non-null values.
    times: 'int64' (default), or 'int96':
        In "int64" mode, datetimes are written as 8-byte integers, us
        resolution; in "int96" mode, they are written as 12-byte blocks, with
        the first 8 bytes as ns within the day, the next 4 bytes the julian day.
        'int96' mode is included only for compatibility.

    Examples
    --------
    >>> fastparquet.write('myfile.parquet', df)  # doctest: +SKIP
    """
    if str(has_nulls) == 'infer':
        has_nulls = None
    if isinstance(row_group_offsets, int):
        l = len(data)
        nparts = max((l - 1) // row_group_offsets + 1, 1)
        chunksize = max(min((l - 1) // nparts + 1, l), 1)
        row_group_offsets = list(range(0, l, chunksize))
    if write_index or write_index is None and index_like(data.index):
        cols = set(data)
        data = data.reset_index()
        index_cols = [c for c in data if c not in cols]
    else:
        index_cols = []
    check_column_names(data.columns, partition_on, fixed_text, object_encoding,
                       has_nulls)
    ignore = partition_on if file_scheme != 'simple' else []
    fmd = make_metadata(data, has_nulls=has_nulls, ignore_columns=ignore,
                        fixed_text=fixed_text, object_encoding=object_encoding,
                        times=times, index_cols=index_cols)

    if file_scheme == 'simple':
        write_simple(filename, data, fmd, row_group_offsets,
                     compression, open_with, has_nulls, append)
    elif file_scheme in ['hive', 'drill']:
        if append:
            pf = api.ParquetFile(filename, open_with=open_with)
            if pf.file_scheme not in ['hive', 'empty', 'flat']:
                raise ValueError('Requested file scheme is %s, but '
                                 'existing file scheme is not.' % file_scheme)
            fmd = pf.fmd
            i_offset = find_max_part(fmd.row_groups)
            if tuple(partition_on) != tuple(pf.cats):
                raise ValueError('When appending, partitioning columns must'
                                 ' match existing data')
        else:
            i_offset = 0
        fn = join_path(filename, '_metadata')
        mkdirs(filename)
        for i, start in enumerate(row_group_offsets):
            end = (row_group_offsets[i+1] if i < (len(row_group_offsets) - 1)
                   else None)
            part = 'part.%i.parquet' % (i + i_offset)
            if partition_on:
                rgs = partition_on_columns(
                    data[start:end], partition_on, filename, part, fmd,
                    compression, open_with, mkdirs,
                    with_field=file_scheme == 'hive'
                )
                fmd.row_groups.extend(rgs)
            else:
                partname = join_path(filename, part)
                with open_with(partname, 'wb') as f2:
                    rg = make_part_file(f2, data[start:end], fmd.schema,
                                        compression=compression, fmd=fmd)
                for chunk in rg.columns:
                    chunk.file_path = part

                fmd.row_groups.append(rg)

        write_common_metadata(fn, fmd, open_with, no_row_groups=False)
        write_common_metadata(join_path(filename, '_common_metadata'), fmd,
                              open_with)
    else:
        raise ValueError('File scheme should be simple|hive, not', file_scheme)
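
A short sketch of the compression forms described in the docstring above: a single codec for every column, and a per-column mapping. GZIP is always available; other codecs may need extra packages, and the file name is illustrative.

import pandas as pd
from fastparquet import write

df = pd.DataFrame({'col1': range(1000), 'col2': ['a', 'b'] * 500})

# one codec for all columns
write('out.parquet', df, compression='GZIP')

# per-column codecs; None leaves that column uncompressed
write('out.parquet', df, compression={'col1': 'GZIP', 'col2': None})
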
Example #19
def time_column():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 10000000
        r = np.random.randint(-1e10, 1e10, n, dtype='int64')
        d = pd.DataFrame({'w': pd.Categorical(np.random.choice(
                ['hi', 'you', 'people'], size=n)),
                          'x': r.view('timedelta64[ns]'),
                          'y': r / np.random.randint(1, 1000, size=n),
                          'z': np.random.randint(0, 127, size=n,
                                                 dtype=np.uint8)})
        d['b'] = r > 0

        for col in d.columns:
            df = d[[col]]
            write(fn, df)
            with measure('%s: write, no nulls' % d.dtypes[col], result):
                write(fn, df, has_nulls=False)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, no nulls' % d.dtypes[col], result):
                pf.to_pandas(categories={'w': 3})

            with measure('%s: write, no nulls, has_null=True' % d.dtypes[col], result):
                write(fn, df, has_nulls=True)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, no nulls, has_null=True' % d.dtypes[col], result):
                pf.to_pandas(categories={'w': 3})

            if d.dtypes[col].kind == 'm':
                d.loc[n//2, col] = pd.to_datetime('NaT')
            elif d.dtypes[col].kind == 'f':
                d.loc[n//2, col] = np.nan
            elif d.dtypes[col].kind in ['i', 'u']:
                continue
            else:
                d.loc[n//2, col] = None
            with measure('%s: write, with null, has_null=True' % d.dtypes[col], result):
                write(fn, df, has_nulls=True)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, with null, has_null=True' % d.dtypes[col], result):
                pf.to_pandas(categories={'w': 3})

            with measure('%s: write, with null, has_null=False' % d.dtypes[col], result):
                write(fn, df, has_nulls=False)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, with null, has_null=False' % d.dtypes[col], result):
                pf.to_pandas(categories={'w': 3})

        return result
Example #20
def test_empty():
    assert join_path("test", ""), "test"
Example #21
def time_column():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 10000000
        r = np.random.randint(-1e10, 1e10, n, dtype='int64')
        d = pd.DataFrame({
            'w':
            pd.Categorical(np.random.choice(['hi', 'you', 'people'], size=n)),
            'x':
            r.view('timedelta64[ns]'),
            'y':
            r / np.random.randint(1, 1000, size=n),
            'z':
            np.random.randint(0, 127, size=n, dtype=np.uint8)
        })

        for col in d.columns:
            df = d[[col]]
            write(fn, df)
            with measure('%s: write, no nulls' % d.dtypes[col], result):
                write(fn, df, has_nulls=False)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, no nulls' % d.dtypes[col], result):
                pf.to_pandas(categories={'w': 3})

            with measure('%s: write, no nulls, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, no nulls, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas(categories={'w': 3})

            if d.dtypes[col].kind == 'm':
                d.loc[n // 2, col] = pd.to_datetime('NaT')
            elif d.dtypes[col].kind == 'f':
                d.loc[n // 2, col] = np.nan
            elif d.dtypes[col].kind in ['i', 'u']:
                continue
            else:
                d.loc[n // 2, col] = None
            with measure('%s: write, with null, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, with null, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas(categories={'w': 3})

            with measure(
                    '%s: write, with null, has_null=False' % d.dtypes[col],
                    result):
                write(fn, df, has_nulls=False)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, with null, has_null=False' % d.dtypes[col],
                         result):
                pf.to_pandas(categories={'w': 3})

        return result
Example #22
def test_join_paths():
    assert join_path('/', 'this/is/a/test/') == '/this/is/a/test'
    assert join_path('', 'this/is/a/test/') == 'this/is/a/test'
Example #23
def write(filename, data, row_group_offsets=50000000,
          compression=None, file_scheme='simple', open_with=default_open,
          mkdirs=default_mkdirs, has_nulls=True, write_index=None,
          partition_on=[], fixed_text=None, append=False,
          object_encoding='infer', times='int64'):
    """ Write Pandas DataFrame to filename as Parquet Format

    Parameters
    ----------
    filename: string
        Parquet collection to write to, either a single file (if file_scheme
        is simple) or a directory containing the metadata and data-files.
    data: pandas dataframe
        The table to write
    row_group_offsets: int or list of ints
        If int, row-groups will be approximately this many rows, rounded down
        to make row groups about the same size; if a list, the explicit index
        values to start new row groups.
    compression: str, dict
        compression to apply to each column, e.g. ``GZIP`` or ``SNAPPY`` or a
        ``dict`` like ``{"col1": "SNAPPY", "col2": None}`` to specify per
        column compression types.
        In both cases, the compressor settings would be the underlying
        compressor defaults. To pass arguments to the underlying compressor,
        each ``dict`` entry should itself be a dictionary::

            {
                col1: {
                    "type": "LZ4",
                    "args": {
                        "mode": "high_compression",
                        "compression": 9
                     }
                },
                col2: {
                    "type": "SNAPPY",
                    "args": None
                },
                "_default": {
                    "type": "GZIP",
                    "args": None
                }
            }

        where ``"type"`` specifies the compression type to use, and ``"args"``
        specifies a ``dict`` that will be turned into keyword arguments for
        the compressor.
        If the dictionary contains a "_default" entry, this will be used for any
        columns not explicitly specified in the dictionary.
    file_scheme: 'simple'|'hive'
        If simple: all goes in a single file
        If hive: each row group is in a separate file, and a separate file
        (called "_metadata") contains the metadata.
    open_with: function
        When called with a f(path, mode), returns an open file-like object
    mkdirs: function
        When called with a path/URL, creates any necessary directories to
        make that location writable, e.g., ``os.makedirs``. This is not
        necessary if using the simple file scheme.
    has_nulls: bool, 'infer' or list of strings
        Whether columns can have nulls. If a list of strings, those given
        columns will be marked as "optional" in the metadata, and include
        null definition blocks on disk. Some data types (floats and times)
        can instead use the sentinel values NaN and NaT, which are not the same
        as NULL in parquet, but functionally act the same in many cases,
        particularly if converting back to pandas later. A value of 'infer'
        will assume nulls for object columns and not otherwise.
    write_index: boolean
        Whether or not to write the index to a separate column.  By default we
        write the index *if* it is not 0, 1, ..., n.
    partition_on: list of column names
        Passed to groupby in order to split data within each row-group,
        producing a structured directory tree. Note: as with pandas, null
        values will be dropped. Ignored if file_scheme is simple.
    fixed_text: {column: int length} or None
        For bytes or str columns, values will be converted
        to fixed-length strings of the given length for the given columns
        before writing, potentially providing a large speed
        boost. The length applies to the binary representation *after*
        conversion for utf8, json or bson.
    append: bool (False)
        If False, construct data-set from scratch; if True, add new row-group(s)
        to existing data-set. In the latter case, the data-set must exist,
        and the schema must match the input data.
    object_encoding: str or {col: type}
        For object columns, this gives the data type, so that the values can
        be encoded to bytes. Possible values are bytes|utf8|json|bson|bool|int|int32,
        where bytes is assumed if not specified (i.e., no conversion). The
        special value 'infer' will cause the type to be guessed from the first
        ten non-null values.
    times: 'int64' (default), or 'int96':
        In "int64" mode, datetimes are written as 8-byte integers, us
        resolution; in "int96" mode, they are written as 12-byte blocks, with
        the first 8 bytes as ns within the day, the next 4 bytes the julian day.
        'int96' mode is included only for compatibility.

    Examples
    --------
    >>> fastparquet.write('myfile.parquet', df)  # doctest: +SKIP
    """
    if str(has_nulls) == 'infer':
        has_nulls = None
    if isinstance(row_group_offsets, int):
        l = len(data)
        nparts = max((l - 1) // row_group_offsets + 1, 1)
        chunksize = max(min((l - 1) // nparts + 1, l), 1)
        row_group_offsets = list(range(0, l, chunksize))
    if (write_index or write_index is None
            and not isinstance(data.index, pd.RangeIndex)):
        cols = set(data)
        data = data.reset_index()
        index_cols = [c for c in data if c not in cols]
    elif isinstance(data.index, pd.RangeIndex):
        # write_index=None, range to metadata
        index_cols = data.index
    else:  # write_index=False
        index_cols = []
    check_column_names(data.columns, partition_on, fixed_text, object_encoding,
                       has_nulls)
    ignore = partition_on if file_scheme != 'simple' else []
    fmd = make_metadata(data, has_nulls=has_nulls, ignore_columns=ignore,
                        fixed_text=fixed_text, object_encoding=object_encoding,
                        times=times, index_cols=index_cols)

    if file_scheme == 'simple':
        write_simple(filename, data, fmd, row_group_offsets,
                     compression, open_with, has_nulls, append)
    elif file_scheme in ['hive', 'drill']:
        if append:
            pf = api.ParquetFile(filename, open_with=open_with)
            if pf.file_scheme not in ['hive', 'empty', 'flat']:
                raise ValueError('Requested file scheme is %s, but '
                                 'existing file scheme is not.' % file_scheme)
            fmd = pf.fmd
            i_offset = find_max_part(fmd.row_groups)
            if tuple(partition_on) != tuple(pf.cats):
                raise ValueError('When appending, partitioning columns must'
                                 ' match existing data')
        else:
            i_offset = 0
        fn = join_path(filename, '_metadata')
        mkdirs(filename)
        for i, start in enumerate(row_group_offsets):
            end = (row_group_offsets[i+1] if i < (len(row_group_offsets) - 1)
                   else None)
            part = 'part.%i.parquet' % (i + i_offset)
            if partition_on:
                rgs = partition_on_columns(
                    data[start:end], partition_on, filename, part, fmd,
                    compression, open_with, mkdirs,
                    with_field=file_scheme == 'hive'
                )
                fmd.row_groups.extend(rgs)
            else:
                partname = join_path(filename, part)
                with open_with(partname, 'wb') as f2:
                    rg = make_part_file(f2, data[start:end], fmd.schema,
                                        compression=compression, fmd=fmd)
                for chunk in rg.columns:
                    chunk.file_path = part

                fmd.row_groups.append(rg)

        fmd.num_rows = sum(rg.num_rows for rg in fmd.row_groups)
        write_common_metadata(fn, fmd, open_with, no_row_groups=False)
        write_common_metadata(join_path(filename, '_common_metadata'), fmd,
                              open_with)
    else:
        raise ValueError('File scheme should be simple|hive, not', file_scheme)
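
The append and file_scheme parameters documented above combine to grow a directory-based data-set in place. A sketch, assuming both frames share the same schema (the directory name is illustrative):

import pandas as pd
from fastparquet import ParquetFile, write

df1 = pd.DataFrame({'x': [1, 2]})
df2 = pd.DataFrame({'x': [3, 4]})

# the first call creates the directory, data files and _metadata
write('dataset_dir', df1, file_scheme='hive')
# later calls add new row-groups to the existing _metadata
write('dataset_dir', df2, file_scheme='hive', append=True)

print(ParquetFile('dataset_dir').count)  # 4
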
Example #24
def time_column():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path('temp.parq')
        n = 10000000
        r = np.random.randint(-1e10, 1e10, n, dtype='int64')
        d = pd.DataFrame({
            'w':
            pd.Categorical(np.random.choice(['hi', 'you', 'people'], size=n)),
            'x':
            r.view('timedelta64[ns]'),
            'y':
            r / np.random.randint(1, 1000, size=n),
            'z':
            np.random.randint(0, 127, size=n, dtype=np.uint8)
        })
        d['b'] = r > 0

        for col in d.columns:
            df = d[[col]]
            write(fn, df)
            with measure('%s: write, no nulls' % d.dtypes[col], result):
                write(fn, df, has_nulls=False)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            with measure("file open", result):
                ParquetFile(fn)

            if col == 'x':
                assert (df.x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, no nulls' % d.dtypes[col], result):
                pf.to_pandas()

            with measure('%s: write, no nulls, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            if col == 'x':
                assert (df.x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, no nulls, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas()

            if d.dtypes[col].kind == 'm':
                d.loc[n // 2, col] = pd.to_datetime('NaT')
            elif d.dtypes[col].kind == 'f':
                d.loc[n // 2, col] = np.nan
            elif d.dtypes[col].kind in ['i', 'u']:
                continue
            else:
                d.loc[n // 2, col] = None
            with measure('%s: write, with null, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            if col == 'x':
                assert (df.x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, with null, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas()

            with measure(
                    '%s: write, with null, has_null=False' % d.dtypes[col],
                    result):
                write(fn, df, has_nulls=False)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            if col == 'x':
                assert (df.x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, with null, has_null=False' % d.dtypes[col],
                         result):
                pf.to_pandas()

        return result