Example #1
import pyarrow as pa
import pyarrow.parquet as pq

def _write_dataframe(dataframe, path):
    # data_util and _make_directory_if_needed are project helpers.
    path = data_util.make_data_path(path)

    _make_directory_if_needed(path)

    # Drop the pandas index and write a single Parquet file.
    table = pa.Table.from_pandas(dataframe, preserve_index=False)
    pq.write_table(table, path)
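A minimal usage sketch, assuming pandas is installed and the helpers above are in scope; the sample DataFrame and output path are hypothetical:

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'score': [0.1, 0.5, 0.9]})
_write_dataframe(df, 'results/scores.parquet')  # hypothetical path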
Example #2
def _write_dataframe(dataframe, path):
    if dataframe.columns.empty:
        raise ValueError('Empty DataFrame cannot be written.')

    # HDFS URIs are passed through untouched; local paths are resolved
    # via the project's path helper.
    if not path.startswith('hdfs://'):
        path = data_util.make_data_path(path)

    # Replace any existing directory at the target before writing.
    _remove_exist_directory_if_dir(path)
    _make_directory_if_needed(path)

    table = pa.Table.from_pandas(dataframe, preserve_index=False)
    pq.write_table(table, path)
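Both branches can be exercised directly; a sketch assuming the helpers above, with hypothetical paths:

import pandas as pd

# hdfs:// URIs skip path resolution (requires a configured HDFS filesystem).
_write_dataframe(pd.DataFrame({'a': [1]}), 'hdfs://host/data/out.parquet')

try:
    _write_dataframe(pd.DataFrame(), 'out.parquet')
except ValueError as exc:
    print(exc)  # Empty DataFrame cannot be written.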
Example #3
def write_to_dataset(table, root_path, partition_cols=None, **kwargs):
    _make_directory_if_needed(brtc_data_utils.make_data_path(root_path))

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        # Write one Hive-style subdirectory (col=value/...) per key group.
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(subgroup, preserve_index=False)
            prefix = "/".join([root_path, subdir])
            outfile = compat.guid() + ".parquet"
            # Resolve the output path once; the original re-applied
            # make_data_path to an already-resolved path.
            full_path = brtc_data_utils.make_data_path(
                "/".join([prefix, outfile]))
            _make_directory_if_needed(full_path)
            pq.write_table(subtable, full_path, **kwargs)
    else:
        # No partitioning: write the whole table as one Parquet file.
        outfile = compat.guid() + ".parquet"
        full_path = brtc_data_utils.make_data_path(
            "/".join([root_path, outfile]))
        pq.write_table(table, full_path, **kwargs)
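With partition columns the function lays files out Hive-style, e.g. dataset_root/year=2020/<guid>.parquet. A sketch with a hypothetical table and root path:

import pandas as pd
import pyarrow as pa

table = pa.Table.from_pandas(pd.DataFrame({
    'year': [2020, 2020, 2021],  # partition column
    'value': [1.0, 2.0, 3.0],
}), preserve_index=False)
write_to_dataset(table, 'dataset_root', partition_cols=['year'])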
Example #4
def to_parquet(df, path, njobs=4):
    path = brtc_data_utils.make_data_path(path)
    os.makedirs(path)

    # One worker thread per shard.
    pool = multiprocessing.pool.ThreadPool(njobs)

    paths = []
    results = []
    # Assign each row to one of njobs random shards and write the
    # shards concurrently via _write_parquet (a project helper).
    for grp, sample in df.groupby(
            lambda _: np.random.choice(range(njobs), 1)[0]):
        sub_path = os.path.join(path, '{}'.format(grp))
        paths.append(sub_path)
        results.append(pool.apply_async(_write_parquet, (sample, sub_path)))

    pool.close()
    pool.join()
    # Re-raise any exception that occurred inside a worker.
    for result in results:
        result.get()
    return paths
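Because rows are assigned to shards at random, the result is up to njobs Parquet files of roughly equal size rather than a meaningful partitioning. A hypothetical call:

shard_paths = to_parquet(df, 'sharded_output', njobs=4)  # df and path hypothetical
print(shard_paths)  # e.g. ['.../0', '.../1', '.../2', '.../3']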
Example #5
def read_parquet(path):
    # Resolve the path with the project helper, then delegate to the
    # shared table reader.
    return table_reader.read_parquet(data_util.make_data_path(path))
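A round-trip sketch combining this reader with the writer from Example #1 (the DataFrame and path are hypothetical):

_write_dataframe(df, 'roundtrip.parquet')
restored = read_parquet('roundtrip.parquet')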