Example no. 1
  def register_file_by_path(self, path):
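    # Derive the table name from the file name, then try a few common
    # delimiters until both pandas and Arrow can parse the file.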
    root, fname = os.path.split(path)
    tablename, _ = os.path.splitext(fname)
    fpath = os.path.join(root, fname)
    loaded = False
    exception = None
    for sep in [',', '|', '\t']:
      df = None # TODO delete
      columnar_tb = None
      try:
        with openfile(fpath) as f:
          df = pandas.read_csv(f, sep=sep) # TODO delete
        columnar_tb = pa_tb.from_pandas(df)
      except Exception as e:
        exception = e

      if df is not None and columnar_tb is not None:
        self.register_dataframe(tablename, df)
        self.register_columnar_tb(tablename, columnar_tb)
        loaded = True
        break

    if not loaded:
      print("Failed to read data file %s" % (fpath))
      print(exception)
Example no. 2
def _geopandas_to_arrow(df, index=None):
    """
    Helper function with main, shared logic for to_parquet/to_feather.
    """
    from pyarrow import Table

    warnings.warn(
        "this is an initial implementation of Parquet/Feather file support and "
        "associated metadata.  This is tracking version 0.1.0 of the metadata "
        "specification at "
        "https://github.com/geopandas/geo-arrow-spec\n\n"
        "This metadata specification does not yet make stability promises.  "
        "We do not yet recommend using this in a production setting unless you "
        "are able to rewrite your Parquet/Feather files.\n\n"
        "To further ignore this warning, you can do: \n"
        "import warnings; warnings.filterwarnings('ignore', "
        "message='.*initial implementation of Parquet.*')",
        UserWarning,
        stacklevel=4,
    )

    _validate_dataframe(df)

    # create geo metadata before altering incoming data frame
    geo_metadata = _create_metadata(df)

    df = _encode_wkb(df)

    table = Table.from_pandas(df, preserve_index=index)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    metadata = table.schema.metadata
    metadata.update({b"geo": _encode_metadata(geo_metadata)})
    return table.replace_schema_metadata(metadata)
Example no. 3
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    """Has geo metadata with missing required fields will raise a ValueError.

    This requires writing the parquet file directly below, so that we can
    control the metadata that is written for this test.
    """

    from pyarrow import parquet, Table

    test_dataset = "naturalearth_lowres"
    df = read_file(get_path(test_dataset))

    # convert to DataFrame and encode geometry to WKB
    df = DataFrame(df)
    df["geometry"] = to_wkb(df["geometry"].values)

    table = Table.from_pandas(df)
    metadata = table.schema.metadata
    metadata.update(geo_meta)
    table = table.replace_schema_metadata(metadata)

    filename = os.path.join(str(tmpdir), "test.pq")
    parquet.write_table(table, filename)

    with pytest.raises(ValueError, match=error):
        read_parquet(filename)
Example no. 4
 def data_table(self):
     df_to_sent = DataFrame(
         {
             'Brand':
             ['Honda Civic', 'Toyota Corolla', 'Ford Focus', 'Audi A4'],
             'Price': [22000, 25000, 27000, 35000]
         },
         columns=['Brand', 'Price'])
     return Table.from_pandas(df_to_sent)
Example no. 5
    def hand_in_result(self):
        """
    GroupBy works as follows:
    
    * Contruct and populate hash table with key defined by the group_exprs expressions  
    * Iterate through each bucket, compose and populate all tuples that conforms to 
      this operator's output schema (see self.init_schema)
    """
        handin_res = self.c.hand_in_result()
        if handin_res.is_terminate():
            return ListColumns(self.schema, None)

        # hash(key): [attr_pos, gr]
        hashtable = defaultdict(lambda: [None, None, []])

        # schema for non-aggregation project exprs
        termrow = ListColumns(self.group_term_schema)

        groupval_cols = []

        for expr in self.group_exprs:
            groupval_cols.append(expr(handin_res))

        if not groupval_cols:
            new_columns = []
            for expr in self.project_exprs:
                new_columns.append(expr(handin_res))
            return ListColumns(self.schema, new_columns)

        for idx in range(groupval_cols[0].length()):
            groupval = tuple([col[idx] for col in groupval_cols])
            key = hash(groupval)
            if not hashtable[key][0]:
                hashtable[key][0] = groupval
                hashtable[key][1] = [
                    attr(handin_res)[idx] for attr in self.group_attrs
                ]
            hashtable[key][2].append(idx)

        res_rows = []

        for _, (key, attrvals, group) in list(hashtable.items()):
            group_list_columns = ListColumns(
                handin_res.schema,
                [col.take(group) if col else None for col in handin_res])
            row = []
            for expr in self.project_exprs:
                if expr.is_type(AggFunc):
                    row.append(expr(group_list_columns).as_py())
                else:
                    termrow.columns = attrvals
                    row.append(expr(termrow).as_py())
            res_rows.append(row)

        return ListColumns(self.schema,
                           Table.from_pandas(pd.DataFrame(res_rows)).columns)
Example no. 6
def save_orc_file(dataframe, filepath):
    """Utility function to write dataframe to disk as orc file."""
    from pyarrow import Table, orc

    df = dataframe.copy()
    for c in df:
        if df[c].dtype.name == "category":
            df[c] = df[c].astype("string[pyarrow]")
    pa_table = Table.from_pandas(df, preserve_index=False)
    orc.write_table(pa_table, filepath)
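
A minimal usage sketch for the helper above, with a hypothetical frame and
output path; the categorical column is exactly the case that the cast inside
save_orc_file is there to handle:

import pandas as pd

frame = pd.DataFrame({
    "color": pd.Categorical(["red", "blue", "red"]),  # categorical column
    "count": [3, 1, 2],
})
save_orc_file(frame, "colors.orc")  # hypothetical output path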
Example no. 7
def convert_csv_to_parquet(infile, outfile):
    """Convert a CSV file into a Parquet file."""

    csv_data = pd.read_csv(infile, index_col=False, header=0)
    csv_table = Table.from_pandas(csv_data, preserve_index=True)
    if not outfile:
        outfile = get_file_name(infile) + '.parquet'
    parquet.write_table(csv_table, outfile)
Example no. 8
def _geopandas_to_arrow(df, index=None):
    """
    Helper function with main, shared logic for to_parquet/to_feather.
    """
    from pyarrow import Table

    _validate_dataframe(df)

    # create geo metadata before altering incoming data frame
    geo_metadata = _create_metadata(df)

    df = df.to_wkb()

    table = Table.from_pandas(df, preserve_index=index)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    metadata = table.schema.metadata
    metadata.update({b"geo": _encode_metadata(geo_metadata)})
    return table.replace_schema_metadata(metadata)
Example no. 9
def write_to_dataset(table,
                     root_path,
                     partition_cols=None,
                     filesystem=None,
                     preserve_index=True,
                     **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition column values,
    subdirectories are created in the following
    manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    from pyarrow import (Table, compat)

    if filesystem is None:
        fs = LocalFileSystem.get_instance()
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
Example no. 10
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition column values,
    subdirectories are created in the following
    manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    from pyarrow import (
        Table,
        compat
    )

    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
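
Examples no. 9 and no. 10 above differ only in how the default filesystem is
resolved. A usage sketch with hypothetical column names, producing the
root_dir/group1=value/group2=value/<uuid>.parquet layout described in the
docstring:

import pandas as pd
from pyarrow import Table

df = pd.DataFrame({
    "group1": ["a", "a", "b"],
    "group2": [1, 2, 1],
    "value": [10.0, 20.0, 30.0],
})
table = Table.from_pandas(df, preserve_index=False)
# Writes e.g. dataset_root/group1=a/group2=1/<uuid>.parquet
write_to_dataset(table, "dataset_root", partition_cols=["group1", "group2"])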
Example no. 11
def _to_parquet(df, path, compression="snappy", index=None, **kwargs):
    """
    Write a GeoDataFrame to the Parquet format.

    Any geometry columns present are serialized to WKB format in the file.

    Requires 'pyarrow'.

    WARNING: this is an initial implementation of Parquet file support and
    associated metadata.  This is tracking version 0.1.0 of the metadata
    specification at:
    https://github.com/geopandas/geo-arrow-spec

    This metadata specification does not yet make stability promises.  As such,
    we do not yet recommend using this in a production setting unless you are
    able to rewrite your Parquet files.


    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    kwargs
        Additional keyword arguments passed to pyarrow.parquet.write_table().
    """

    import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support.")
    from pyarrow import parquet, Table

    warnings.warn(
        "this is an initial implementation of Parquet file support and "
        "associated metadata.  This is tracking version 0.1.0 of the metadata "
        "specification at "
        "https://github.com/geopandas/geo-arrow-spec\n\n"
        "This metadata specification does not yet make stability promises.  "
        "We do not yet recommend using this in a production setting unless you "
        "are able to rewrite your Parquet files.\n\n"
        "To further ignore this warning, you can do: \n"
        "import warnings; warnings.filterwarnings('ignore', "
        "message='.*initial implementation of Parquet.*')",
        UserWarning,
        stacklevel=3,
    )

    _validate_dataframe(df)

    # create geo metadata before altering incoming data frame
    geo_metadata = _create_metadata(df)

    df = _encode_wkb(df)

    table = Table.from_pandas(df, preserve_index=index)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    metadata = table.schema.metadata
    metadata.update({b"geo": _encode_metadata(geo_metadata)})

    table = table.replace_schema_metadata(metadata)
    parquet.write_table(table, path, compression=compression, **kwargs)
Example no. 12
 def write(self, df, path):
     self._check_no_duplicate_cols(df)
     if self._check_dtypes:
         self._check_no_categorical_cols(df)
     with path.open('wb') as file_:
         parquet.write_table(Table.from_pandas(df), file_)
Example no. 13
 def write_path(self, dataset):
     table = Table.from_pandas(dataset)
     write_table(table, self.path)
Example no. 14
def upsert_to_dataset(table, root_path, partition_cols=None,
                      filesystem=None, preserve_index=True,
                      temp_folder=None, categories=None, **kwargs):
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)
    if temp_folder:
        if not os.path.exists(temp_folder):
            temp_folder = None

    if partition_cols is not None and len(partition_cols) > 0:
        # df is the data in the new table
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])

            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            existing_files = [f for f in os.listdir(prefix) if f.endswith('.parquet')]
            if len(existing_files) > 1:
                raise ValueError('Unsupported scenario, multiple files found in path %s' % prefix)
            if len(existing_files) == 1:
                outfile = existing_files[0]
                full_path = "/".join([prefix, outfile])
                old_table = read_table(full_path)
                category_cols = _to_category_cols(subgroup, categories)  # get categories before merging
                old_subgroup = old_table.to_pandas()
                # TODO: compare old schema with new
                subgroup = _upsert_dataframes(subgroup, old_subgroup)
                # subgroup = pd.concat([subgroup, old_subgroup[~old_subgroup.index.isin(subgroup.index.values)]])
                for c, v in category_cols.items():
                    # astype('category', categories=...) is no longer supported by
                    # pandas; an explicit CategoricalDtype restores the categories
                    subgroup.loc[:, c] = subgroup.loc[:, c].astype(pd.CategoricalDtype(categories=v))
            else:
                outfile = compat.guid() + ".parquet"
                full_path = "/".join([prefix, outfile])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index,
                                         schema=subschema)
            write_file = os.path.join(temp_folder, outfile) if temp_folder else full_path
            with fs.open(write_file, 'wb') as f:
                write_table(subtable, f, **kwargs)
            if temp_folder:
                shutil.move(write_file, full_path)
    else:
        existing_files = [f for f in os.listdir(root_path) if f.endswith('.parquet')]
        if len(existing_files) > 1:
            raise ValueError('Unsupported scenario, multiple files found in path %s' % root_path)
        if len(existing_files) == 1:
            # append use case
            outfile = existing_files[0]
            full_path = "/".join([root_path, outfile])
            old_table = read_table(full_path)
            subgroup = table.to_pandas()
            category_cols = _to_category_cols(subgroup, categories)
            old_subgroup = old_table.to_pandas()
            # TODO: compare old schema with new
            subgroup = _upsert_dataframes(subgroup, old_subgroup)
            # subgroup = pd.concat([old_subgroup[~old_subgroup.index.isin(subgroup.index)], subgroup])
            for c, v in category_cols.items():
                # same CategoricalDtype fix as above
                subgroup.loc[:, c] = subgroup.loc[:, c].astype(pd.CategoricalDtype(categories=v))
            schema = table.schema
            table = Table.from_pandas(
                subgroup,
                preserve_index=preserve_index,
                schema=schema
            )
        else:
            # write use case
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([root_path, outfile])

        write_file = os.path.join(temp_folder, outfile) if temp_folder else full_path
        with fs.open(write_file, 'wb') as f:
            write_table(table, f, **kwargs)
        if temp_folder:
            shutil.move(write_file, full_path)
Example no. 15
from pandas import DataFrame
from pyarrow import parquet, Table

data_file = '1000SalesRecords.csv'
separator_char = ','

with open(data_file, 'r') as file:
    headers = [i.strip().strip('"') for i in file.readline().strip().split(separator_char)]
    dict_of_lists = {i: [] for i in headers}
    file_data = file.read().splitlines()
    for line in file_data:
        split_line = line.strip().split(separator_char)
        for col_name, val in zip(headers, split_line):
            dict_of_lists[col_name].append(val.strip())

df = DataFrame(dict_of_lists)

table = Table.from_pandas(df=df, preserve_index=False)
parquet.write_table(table, 'example_noindex.parquet')
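
The hand-rolled parser above keeps every column as a string; a sketch of the
same conversion via pandas.read_csv, which also infers numeric dtypes
(assuming the same input file):

import pandas as pd
from pyarrow import parquet, Table

df = pd.read_csv('1000SalesRecords.csv', sep=',')
table = Table.from_pandas(df, preserve_index=False)
parquet.write_table(table, 'example_inferred_dtypes.parquet')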
Example no. 16
import pyarrow.parquet as pq
from pyarrow import Table

# Creates a miniature dataset for testing purposes
# with the first 20000 entries.

filename = 'part-00000-8bbff892-97d2-4011-9961-703e38972569.c000.snappy.parquet'

df = pq.read_table(filename).to_pandas()

mini_df = df.head(20000)
table = Table.from_pandas(mini_df, nthreads=1)
pq.write_table(table, 'mini.parquet')
Example no. 17
 def write(self, df, file_):
     if self._check_dtypes:
         self._check_no_categorical_cols(df)
     parquet.write_table(Table.from_pandas(df), file_)
Example no. 18
def from_frame(data, index=False, schema=None):
    return Table.from_pandas(data, preserve_index=index, schema=schema)
Example no. 19
def to_arrow(df):
    return Table.from_pandas(df)
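
A short usage sketch for the thin wrappers in Examples no. 18 and no. 19,
using hypothetical data. With preserve_index left at its default, recent
pyarrow records a plain RangeIndex as schema metadata only, while
from_frame's index=False drops it entirely:

import pandas as pd

df = pd.DataFrame({'brand': ['Honda Civic', 'Audi A4'],
                   'price': [22000, 35000]})

full_table = to_arrow(df)    # RangeIndex kept as metadata only
no_index = from_frame(df)    # preserve_index=False, index dropped
print(full_table.schema)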