def _write_partition_fastparquet(df, fs, path, filename, fmd, compression, partition_on):
    """Write one dataframe partition to parquet using fastparquet.

    Returns the row-group metadata for the written partition, or ``None``
    when the partition is empty (nothing is written in that case).
    """
    from fastparquet.writer import partition_on_columns, make_part_file
    import fastparquet

    # fastparquet mutates the file metadata in a non-threadsafe manner, so
    # hand it a shallow copy instead of the shared object.
    fmd = copy.copy(fmd)

    if len(df) == 0:
        # Empty partition: write nothing, report no row groups.
        return None

    if partition_on:
        # The partition_on_columns signature changed in fastparquet 0.1.4
        # (the path-separator argument was dropped).
        uses_new_api = LooseVersion(fastparquet.__version__) >= '0.1.4'
        if uses_new_api:
            return partition_on_columns(df, partition_on, path, filename,
                                        fmd, compression, fs.open, fs.mkdirs)
        return partition_on_columns(df, partition_on, path, filename, fmd,
                                    fs.sep, compression, fs.open, fs.mkdirs)

    # fastparquet doesn't set `num_rows` in the output metadata itself;
    # fill it in before writing the part file.
    fmd.num_rows = len(df)
    target = fs.sep.join([path, filename])
    with fs.open(target, 'wb') as fil:
        return make_part_file(fil, df, fmd.schema,
                              compression=compression, fmd=fmd)
def _write_partition_fastparquet(df, fs, path, filename, fmd, compression, partition_on):
    """Persist a single partition of *df* as a parquet part file.

    The return value is whatever row-group metadata fastparquet produced,
    or ``None`` for an empty partition (which writes no data at all).
    """
    import fastparquet
    from fastparquet.writer import make_part_file, partition_on_columns

    # Shallow-copy the metadata: fastparquet mutates it, and the original
    # may be shared across threads.
    fmd = copy.copy(fmd)

    n_rows = len(df)
    if not n_rows:
        # Nothing to do for an empty partition.
        rgs = None
    elif partition_on:
        # fastparquet >= 0.1.4 removed the separator argument from
        # partition_on_columns; dispatch on the installed version.
        if LooseVersion(fastparquet.__version__) >= '0.1.4':
            args = (df, partition_on, path, filename, fmd,
                    compression, fs.open, fs.mkdirs)
        else:
            args = (df, partition_on, path, filename, fmd,
                    fs.sep, compression, fs.open, fs.mkdirs)
        rgs = partition_on_columns(*args)
    else:
        # fastparquet leaves `num_rows` unset in the output metadata;
        # set it here so the footer is correct.
        fmd.num_rows = n_rows
        with fs.open(fs.sep.join([path, filename]), 'wb') as fil:
            rgs = make_part_file(fil, df, fmd.schema,
                                 compression=compression, fmd=fmd)
    return rgs
def write_partition(
    cls,
    df,
    path,
    fs,
    filename,
    partition_on,
    return_metadata,
    fmd=None,
    compression=None,
    custom_metadata=None,
    **kwargs,
):
    """Write one partition of *df* under *path* with fastparquet.

    Returns the list of row-group metadata objects when *return_metadata*
    is truthy, otherwise an empty list.
    """
    # fastparquet mutates the metadata object; work on a shallow copy.
    fmd = copy.copy(fmd)

    # Merge user-supplied key/value pairs into the parquet footer metadata.
    # NOTE(review): when fmd is None, custom_metadata is silently ignored —
    # confirm that callers never pass custom_metadata without fmd.
    if custom_metadata and fmd is not None:
        extra = [
            fastparquet.parquet_thrift.KeyValue(key=key, value=value)
            for key, value in custom_metadata.items()
        ]
        fmd.key_value_metadata.extend(extra)

    if not len(df):
        # Empty partition: nothing written, no row groups.
        rgs = []
    elif partition_on:
        mkdirs = lambda d: fs.mkdirs(d, exist_ok=True)
        # partition_on_columns dropped its separator argument in
        # fastparquet 0.1.4; select the matching call signature.
        if parse_version(fastparquet.__version__) >= parse_version("0.1.4"):
            rgs = partition_on_columns(df, partition_on, path, filename,
                                       fmd, compression, fs.open, mkdirs)
        else:
            rgs = partition_on_columns(
                df,
                partition_on,
                path,
                filename,
                fmd,
                fs.sep,
                compression,
                fs.open,
                mkdirs,
            )
    else:
        with fs.open(fs.sep.join([path, filename]), "wb") as fil:
            # fastparquet leaves num_rows unset; fix the footer here.
            fmd.num_rows = len(df)
            rg = make_part_file(fil, df, fmd.schema,
                                compression=compression, fmd=fmd)
        # Record the relative file path on every column chunk so the
        # row-group metadata can be aggregated into a global _metadata.
        for chunk in rg.columns:
            chunk.file_path = filename
        rgs = [rg]

    return rgs if return_metadata else []
def write_partition(
    cls,
    df,
    path,
    fs,
    filename,
    partition_on,
    return_metadata,
    fmd=None,
    compression=None,
    **kwargs,
):
    """Write a single partition of *df* to parquet via fastparquet.

    When *return_metadata* is truthy the produced row-group metadata list
    is returned; otherwise the result is always an empty list.
    """
    # Work on a shallow copy — fastparquet mutates the metadata in place.
    fmd = copy.copy(fmd)

    row_count = len(df)
    if row_count == 0:
        # Nothing to write for an empty partition.
        rgs = []
    elif partition_on:
        mkdirs = lambda d: fs.mkdirs(d, exist_ok=True)
        # fastparquet 0.1.4 removed the separator argument from
        # partition_on_columns; pick the matching signature.
        # NOTE(review): LooseVersion is deprecated (distutils removal);
        # consider migrating to packaging.version.parse.
        if LooseVersion(fastparquet.__version__) >= "0.1.4":
            rgs = partition_on_columns(df, partition_on, path, filename,
                                       fmd, compression, fs.open, mkdirs)
        else:
            rgs = partition_on_columns(
                df,
                partition_on,
                path,
                filename,
                fmd,
                fs.sep,
                compression,
                fs.open,
                mkdirs,
            )
    else:
        with fs.open(fs.sep.join([path, filename]), "wb") as fil:
            # fastparquet leaves num_rows unset in the footer; set it here.
            fmd.num_rows = row_count
            rg = make_part_file(fil, df, fmd.schema,
                                compression=compression, fmd=fmd)
        # Stamp each column chunk with the part file's relative path so
        # metadata can later be combined into a global _metadata file.
        for chunk in rg.columns:
            chunk.file_path = filename
        rgs = [rg]

    return rgs if return_metadata else []