コード例 #1
0
ファイル: parquet.py プロジェクト: ruipengliu/dask
def _write_partition_fastparquet(df, fs, path, filename, fmd, compression,
                                 partition_on):
    """Write a single dataframe partition to disk through fastparquet.

    Parameters
    ----------
    df : dataframe holding the rows of this partition
    fs : filesystem object; ``open``, ``mkdirs`` and ``sep`` are used
    path : directory of the dataset
    filename : name of the part file inside ``path``
    fmd : fastparquet file metadata; only a copy of it is handed on
    compression : compression setting forwarded to fastparquet
    partition_on : columns to partition on; falsy for a plain part file

    Returns
    -------
    ``None`` for an empty partition, otherwise whatever
    ``partition_on_columns`` or ``make_part_file`` returned.
    """
    from fastparquet.writer import partition_on_columns, make_part_file
    import fastparquet

    # fastparquet modifies the metadata object in a way that is not
    # thread-safe, so hand it a private copy instead of the shared one.
    metadata = copy.copy(fmd)

    if len(df) == 0:
        # Empty partitions produce no file at all.
        return None

    if partition_on:
        # fastparquet releases older than 0.1.4 are called with an
        # additional ``fs.sep`` argument right after the metadata.
        if LooseVersion(fastparquet.__version__) >= '0.1.4':
            call_args = (df, partition_on, path, filename, metadata,
                         compression, fs.open, fs.mkdirs)
        else:
            call_args = (df, partition_on, path, filename, metadata,
                         fs.sep, compression, fs.open, fs.mkdirs)
        return partition_on_columns(*call_args)

    # fastparquet does not fill in ``num_rows`` of the output metadata by
    # itself, so record the row count before writing.
    metadata.num_rows = len(df)
    target = fs.sep.join([path, filename])
    with fs.open(target, 'wb') as sink:
        row_group = make_part_file(sink, df, metadata.schema,
                                   compression=compression, fmd=metadata)
    return row_group
コード例 #2
0
ファイル: parquet.py プロジェクト: yliapis/dask
def _write_partition_fastparquet(df, fs, path, filename, fmd, compression,
                                 partition_on):
    """Persist one partition of a dataframe using fastparquet.

    An empty ``df`` writes nothing and yields ``None``.  When
    ``partition_on`` is truthy the work is delegated to fastparquet's
    ``partition_on_columns``; otherwise a single part file named
    ``filename`` is written under ``path`` with ``make_part_file``.  The
    result of the fastparquet call is returned unchanged.

    ``fs`` supplies ``open``, ``mkdirs`` and ``sep``; ``fmd`` is the file
    metadata (copied before use) and ``compression`` is passed through to
    fastparquet.
    """
    from fastparquet.writer import partition_on_columns, make_part_file
    import fastparquet

    # The metadata is mutated by fastparquet without any locking; operate
    # on a copy so concurrent writers do not interfere with each other.
    meta = copy.copy(fmd)

    row_groups = None  # stays None when there is nothing to write
    if len(df):
        if partition_on:
            legacy_signature = not (
                LooseVersion(fastparquet.__version__) >= '0.1.4')
            if legacy_signature:
                # Releases before 0.1.4 take the path separator explicitly.
                row_groups = partition_on_columns(
                    df, partition_on, path, filename, meta, fs.sep,
                    compression, fs.open, fs.mkdirs)
            else:
                row_groups = partition_on_columns(
                    df, partition_on, path, filename, meta,
                    compression, fs.open, fs.mkdirs)
        else:
            # ``num_rows`` is not set correctly by fastparquet in the
            # output metadata; set it explicitly here.
            meta.num_rows = len(df)
            part_path = fs.sep.join([path, filename])
            with fs.open(part_path, 'wb') as out:
                row_groups = make_part_file(
                    out, df, meta.schema, compression=compression, fmd=meta)
    return row_groups
コード例 #3
0
ファイル: fastparquet.py プロジェクト: vijaykriishna/dask
    def write_partition(
        cls,
        df,
        path,
        fs,
        filename,
        partition_on,
        return_metadata,
        fmd=None,
        compression=None,
        custom_metadata=None,
        **kwargs,
    ):
        """Write one partition with fastparquet and report its row groups.

        Parameters
        ----------
        df : dataframe holding the rows of this partition
        path : directory of the dataset
        fs : filesystem object; ``open``, ``mkdirs`` and ``sep`` are used
        filename : name of the part file inside ``path``
        partition_on : columns to partition on; falsy for a plain part file
        return_metadata : when falsy, an empty list is returned regardless
            of what was written
        fmd : fastparquet file metadata.  Although it defaults to ``None``,
            the unpartitioned branch reads ``fmd.schema`` and sets
            ``fmd.num_rows``, so a real object is needed there.
        compression : compression setting forwarded to fastparquet
        custom_metadata : optional mapping of extra key/value entries to
            attach to this call's copy of the metadata
        **kwargs : accepted and ignored

        Returns
        -------
        list
            ``[]`` for an empty partition or when ``return_metadata`` is
            falsy; otherwise the result of ``partition_on_columns`` or a
            one-element list with the row group from ``make_part_file``.
        """
        # Work on a copy so attribute assignments below stay local to this
        # call.
        fmd = copy.copy(fmd)
        if custom_metadata and fmd is not None:
            # ``copy.copy`` is shallow (unless the metadata class customises
            # copying), so ``key_value_metadata`` is still the caller's list.
            # Extending it in place would add the custom entries to the
            # shared metadata once per partition; build a fresh list and
            # rebind it on the copy instead.
            extra_entries = [
                fastparquet.parquet_thrift.KeyValue(key=key, value=value)
                for key, value in custom_metadata.items()
            ]
            fmd.key_value_metadata = (
                list(fmd.key_value_metadata or []) + extra_entries
            )

        if not len(df):
            # Write nothing for empty partitions
            rgs = []
        elif partition_on:
            mkdirs = lambda x: fs.mkdirs(x, exist_ok=True)
            if parse_version(
                    fastparquet.__version__) >= parse_version("0.1.4"):
                rgs = partition_on_columns(df, partition_on, path, filename,
                                           fmd, compression, fs.open, mkdirs)
            else:
                # Releases before 0.1.4 take the path separator explicitly.
                rgs = partition_on_columns(
                    df,
                    partition_on,
                    path,
                    filename,
                    fmd,
                    fs.sep,
                    compression,
                    fs.open,
                    mkdirs,
                )
        else:
            with fs.open(fs.sep.join([path, filename]), "wb") as fil:
                # Record the row count on the metadata before writing.
                fmd.num_rows = len(df)
                rg = make_part_file(fil,
                                    df,
                                    fmd.schema,
                                    compression=compression,
                                    fmd=fmd)
            # Tag every column chunk with the file it was written to.
            for chunk in rg.columns:
                chunk.file_path = filename
            rgs = [rg]
        if return_metadata:
            return rgs
        else:
            return []
コード例 #4
0
 def write_partition(
     cls,
     df,
     path,
     fs,
     filename,
     partition_on,
     return_metadata,
     fmd=None,
     compression=None,
     **kwargs,
 ):
     """Write one partition with fastparquet and report its row groups.

     An empty ``df`` writes nothing.  With a truthy ``partition_on`` the
     write is delegated to ``partition_on_columns``; otherwise a single
     part file ``filename`` is written under ``path`` and each column chunk
     of the resulting row group gets ``file_path`` set to ``filename``.

     ``fs`` supplies ``open``, ``mkdirs`` and ``sep``.  ``fmd`` is copied
     before use; the unpartitioned branch reads ``schema`` from the copy and
     sets ``num_rows`` on it.  ``compression`` is forwarded to fastparquet
     and ``**kwargs`` is accepted but unused.

     Returns an empty list when ``df`` is empty or ``return_metadata`` is
     falsy, otherwise the collected row groups.
     """
     metadata = copy.copy(fmd)

     if len(df) == 0:
         # Nothing is written for an empty partition; both outcomes of the
         # ``return_metadata`` check would be an empty list.
         return []

     if partition_on:

         def _ensure_dirs(x):
             # Tolerate directories that already exist.
             return fs.mkdirs(x, exist_ok=True)

         if LooseVersion(fastparquet.__version__) >= "0.1.4":
             call_args = (df, partition_on, path, filename, metadata,
                          compression, fs.open, _ensure_dirs)
         else:
             # Releases before 0.1.4 take the path separator explicitly.
             call_args = (df, partition_on, path, filename, metadata,
                          fs.sep, compression, fs.open, _ensure_dirs)
         row_groups = partition_on_columns(*call_args)
     else:
         target = fs.sep.join([path, filename])
         with fs.open(target, "wb") as sink:
             # Record the row count on the metadata before writing.
             metadata.num_rows = len(df)
             row_group = make_part_file(
                 sink, df, metadata.schema,
                 compression=compression, fmd=metadata)
         # Tag every column chunk with the file it was written to.
         for column_chunk in row_group.columns:
             column_chunk.file_path = filename
         row_groups = [row_group]

     return row_groups if return_metadata else []