Example #1
    def test_read_multiple_parquet_files(self):
        import pyarrow.parquet as pq

        nfiles = 10
        size = 5

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        test_data = []
        paths = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)
            paths.append(path)

        result = self.hdfs.read_parquet(tmpdir)
        expected = pa.concat_tables(test_data)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Example #2
def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, row_group_size=N // K,
                   compression='snappy', version='2.0')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols)
                  for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
Example #3
def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Example #4
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Example #5
def _write_table(table, path, **kwargs):
    import pyarrow.parquet as pq

    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)

    pq.write_table(table, path, **kwargs)
    return table
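A minimal usage sketch for the helper above (the file name and data are hypothetical; pandas and pyarrow are assumed to be imported as pd and pa, as in the surrounding examples):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
# A DataFrame is converted to a pyarrow.Table before writing;
# the helper returns the Table it actually wrote.
written_table = _write_table(df, 'helper_roundtrip.parquet')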
Example #6
def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    return pq.ParquetFile(buf)
Example #7
def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #8
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #9
def test_pandas_parquet_2_0_rountrip(tmpdir):
    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #10
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            kwargs.pop('compression', None)
            parquet.write_metadata(t.schema, fil, **kwargs)
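A hedged usage sketch for the partition writer above, using the builtin open as open_with and a hypothetical local file name; extra keyword arguments are forwarded to parquet.write_table:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
# Write a single partition to a local file, forwarding the compression codec.
_write_partition_pyarrow(df, open, 'part.0.parquet', write_index=False,
                         compression='snappy')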
Example #11
def read_parquet(fn):
    """Read a Parquet file with pyarrow, keep its first three columns and write them back out."""
    print("Loading parquet file: %s..." % fn)
    tbl = pq.read_table(fn)
    df = tbl.to_pandas()
    d = df.iloc[:, 0:3]

    table = pa.Table.from_pandas(d)
    pq.write_table(table, 'example.parquet')
Example #12
def test_read_no_metadata(tmpdir, engine):
    # use pyarrow.parquet to create a parquet file without
    # pandas metadata
    pa = pytest.importorskip("pyarrow")
    import pyarrow.parquet as pq
    tmp = str(tmpdir) + "table.parq"

    table = pa.Table.from_arrays([pa.array([1, 2, 3]),
                                  pa.array([3, 4, 5])],
                                 names=['A', 'B'])
    pq.write_table(table, tmp)
    result = dd.read_parquet(tmp, engine=engine)
    expected = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    assert_eq(result, expected)
Example #13
def parquet(tmpdir, data):
    pa = pytest.importorskip('pyarrow')
    import pyarrow.parquet as pq  # noqa: E402
    from ibis.file.parquet import ParquetClient

    # create single files
    d = tmpdir.mkdir('pq')

    for k, v in data.items():
        f = d / '{}.parquet'.format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    return ParquetClient(tmpdir).database()
Example #14
def test_client(tmpdir, data):

    # construct with a path to a file
    d = tmpdir / 'pq'
    d.mkdir()

    for k, v in data.items():
        f = d / "{}.parquet".format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    c = ParquetClient(tmpdir)
    assert c.list_databases() == ['pq']
    assert c.database().pq.list_tables() == ['close', 'open']
Example #15
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Example #16
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()

    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
Example #17
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
Example #18
def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    table1 = pq.read_table(buf, nthreads=4)

    buf.seek(0)
    table2 = pq.read_table(buf, nthreads=1)

    assert table1.equals(table2)
Example #19
def test_single_pylist_column_roundtrip(tmpdir):
    for dtype in [int, float]:
        filename = tmpdir.join('single_{}_column.parquet'
                               .format(dtype.__name__))
        data = [pa.from_pylist(list(map(dtype, range(5))))]
        table = pa.Table.from_arrays(data, names=('a', 'b'), name='table_name')
        pq.write_table(table, filename.strpath)
        table_read = pq.read_table(filename.strpath)
        for col_written, col_read in zip(table.itercolumns(),
                                         table_read.itercolumns()):
            assert col_written.name == col_read.name
            assert col_read.data.num_chunks == 1
            data_written = col_written.data.chunk(0)
            data_read = col_read.data.chunk(0)
            assert data_written.equals(data_read)
Example #20
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.ParquetFile(buf).metadata

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    pdt.assert_frame_equal(df, fileh.read().to_pandas())
Example #21
def parquet(tables, data_directory, ignore_missing_dependency, **params):
    try:
        import pyarrow as pa  # noqa: F401
        import pyarrow.parquet as pq  # noqa: F401
    except ImportError:
        msg = 'PyArrow dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    data_directory = Path(data_directory)
    for table, df in read_tables(tables, data_directory):
        arrow_table = pa.Table.from_pandas(df)
        target_path = data_directory / '{}.parquet'.format(table)
        pq.write_table(arrow_table, str(target_path))
Example #22
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        pq.write_table(table, path)

        result = pq.read_table(path).to_pandas()

        pdt.assert_frame_equal(result, df)
Example #23
def _write_partition_pyarrow(df, open_with, path, fs, filename, write_index,
                             partition_on, metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    if partition_on:
        parquet.write_to_dataset(t, path, partition_cols=partition_on, filesystem=fs)
    else:
        with open_with(filename, 'wb') as fil:
            parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {k: v for k, v in kwargs.items()
                           if k in _pyarrow_write_metadata_kwargs}
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
Example #24
def json_to_parquet(data, output, schema):
    column_data = {}
    array_data = []

    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col

    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ms')))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int64().id:
            array_data.append(pa.array([int(ele) for ele in _col], type=pa.int64()))
        else:
            array_data.append(pa.array(_col, type=column.type))

    data = pa.RecordBatch.from_arrays(array_data, schema.names)

    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])

    pq.write_table(table, output, compression='SNAPPY', coerce_timestamps='ms')
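A short, hedged usage sketch for the converter above, with a hypothetical schema and two JSON-like records; the timestamp column exercises the millisecond coercion path:

schema = pa.schema([
    ('name', pa.string()),
    ('value', pa.float64()),
    ('created', pa.timestamp('ms')),
])
records = [
    {'name': 'alpha', 'value': 1.5, 'created': '2021-01-01T00:00:00'},
    {'name': 'beta', 'value': 2.5, 'created': '2021-06-01T12:30:00'},
]
# Columns are assembled per the schema and written with SNAPPY compression.
json_to_parquet(records, 'records.parquet', schema)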
Example #25
def test_direct_read_dictionary(use_legacy_dataset):
    # ARROW-3325
    repeats = 10
    nunique = 5

    data = [
        [util.rands(10) for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0'],
                           use_legacy_dataset=use_legacy_dataset)

    # Compute dictionary-encoded subfield
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
Example #26
def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):

    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)
    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
Example #27
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = pa.Table.from_pandas(df)

    with open(filename, 'wb') as f:
        pq.write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(open(filename, 'rb').read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #28
    def _write_data(self,
                    directory=None,
                    schema=None,
                    prefix=tempfile.template,
                    row_group_size=1000,
                    codec='none',
                    count=None):
        if directory is None:
            directory = self.temp_dir

        with tempfile.NamedTemporaryFile(delete=False,
                                         dir=directory,
                                         prefix=prefix) as f:
            table = self._records_as_arrow(schema, count)
            pq.write_table(table,
                           f,
                           row_group_size=row_group_size,
                           compression=codec,
                           use_deprecated_int96_timestamps=True)

            return f.name
Example #29
def getdata(year: int):
    """
    A helper function to retrieve NYC yellow taxi trip data and save it locally as Parquet files.

    Args:
        year: The year for which the monthly trip data should be retrieved.

    Returns:
        None. Saves one Parquet file per month to the local drive.
    """

    for month in range(1, 13):
        if month < 10:
            linkurl = f"http://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year}-0{month}.csv"
        elif month >= 10:
            linkurl = f"http://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year}-{month}.csv"

        df = pd.read_csv(linkurl, sep=',')

        table = pa.Table.from_pandas(df)
        pq.write_table(table, f"../avgdrive/nyc_yellow{year}-{month}.parquet")
Example #30
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
Example #31
def test_multi_dataset_metadata(tempdir):
    filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
    metapath = str(tempdir / "_metadata")

    # create a test dataset
    df = pd.DataFrame({
        'one': [1, 2, 3],
        'two': [-1, -2, -3],
        'three': [[1, 2], [2, 3], [3, 4]],
    })
    table = pa.Table.from_pandas(df)

    # write dataset twice and collect/merge metadata
    _meta = None
    for filename in filenames:
        meta = []
        pq.write_table(table, str(tempdir / filename),
                       metadata_collector=meta)
        meta[0].set_file_path(filename)
        if _meta is None:
            _meta = meta[0]
        else:
            _meta.append_row_groups(meta[0])

    # Write merged metadata-only file
    with open(metapath, "wb") as f:
        _meta.write_metadata_file(f)

    # Read back the metadata
    meta = pq.read_metadata(metapath)
    md = meta.to_dict()
    _md = _meta.to_dict()
    for key in _md:
        if key != 'serialized_size':
            assert _md[key] == md[key]
    assert _md['num_columns'] == 3
    assert _md['num_rows'] == 6
    assert _md['num_row_groups'] == 2
    assert _md['serialized_size'] == 0
    assert md['serialized_size'] > 0
Example #32
    def save_dataframe(self, dataframe):
        """
        Save a DataFrame to the store.
        """
        storepath = self.temporary_object_path(str(uuid.uuid4()))

        # switch parquet lib
        parqlib = self.get_parquet_lib()
        if isinstance(dataframe, pd.DataFrame):
            #parqlib is ParquetLib.ARROW: # other parquet libs are deprecated, remove?
            import pyarrow as pa
            from pyarrow import parquet
            table = pa.Table.from_pandas(dataframe)
            parquet.write_table(table, storepath)
        elif parqlib is ParquetLib.SPARK:
            from pyspark import sql as sparksql
            assert isinstance(dataframe, sparksql.DataFrame)
            dataframe.write.parquet(storepath)
        else:
            assert False, "Unimplemented ParquetLib %s" % parqlib

        # Move serialized DataFrame to object store
        if os.path.isdir(storepath):  # Pyspark
            hashes = []
            files = [
                ofile for ofile in os.listdir(storepath)
                if ofile.endswith(".parquet")
            ]
            for obj in files:
                path = os.path.join(storepath, obj)
                objhash = digest_file(path)
                move(path, self.object_path(objhash))
                hashes.append(objhash)
            rmtree(storepath)
        else:
            filehash = digest_file(storepath)
            move(storepath, self.object_path(filehash))
            hashes = [filehash]

        return hashes
Example #33
    def prepare():
        print("Download titles....")
        imdb_title = pd.read_csv(IMDB_TITLE_GZIP, sep='\t', dtype='str', index_col='tconst', engine='c')
        imdb_title = imdb_title[imdb_title['titleType']=='movie']
        imdb_title = imdb_title.dropna(subset=['startYear', 'originalTitle'])

        print("Download ratings....")
        table = pa.Table.from_pandas(pd.merge(
            imdb_title,
            pd.read_csv(IMDB_RATING_GZIP, sep='\t', dtype='str', index_col='tconst', engine='c'),
            how='left',
            left_index=True,
            right_index=True, sort=False), preserve_index=True)
        pq.write_table(table, IMDB_MOVIES_PARQUET, compression='gzip')

        print("Download actors....")
        imdb_actors = pd.read_csv(IMDB_ACTORS_GZIP, sep='\t', dtype='str', index_col='tconst', engine='c')
        imdb_actors = imdb_actors[(imdb_actors["ordering"] == '1') & (
                    (imdb_actors["category"] == 'actor') | (imdb_actors["category"] == 'actress'))]
        imdb_actors_names = pd.read_csv(IMDB_ACTORS_NAMES_GZIP, sep='\t', dtype='str', index_col='nconst', engine='c')
        imdb_actors_with_names = imdb_actors.merge(imdb_actors_names, right_index=True, left_on="nconst")
        imdb_actors_with_names = imdb_actors_with_names[["primaryName", "characters"]]
        pa_actors = pa.Table.from_pandas(imdb_actors_with_names)
        pq.write_table(pa_actors, IMDB_ACTORS_PARQUET, compression='gzip')

        print("Download covers....")
        table = pa.Table.from_pandas(pd.read_csv(IMDB_COVERS_CSV), preserve_index=False)
        pq.write_table(table, IMDB_COVERS_PARQUET, compression='gzip')
Example #34
def main():
    f1 = '../data/e024b429-3fb1-4a6d-b4e6-23fe5eaadfc5'
    f2 = '../data/468cd686-0b96-4296-92ff-45f46c73b90e'

    fp1 = 'dataset1.parquet'
    fp2 = 'dataset2.parquet'
    fp3 = 'dataset3.parquet'

    ds1 = xr.open_dataset(f1)
    pq.write_table(pa.Table.from_pandas(ds1.to_dataframe()), fp1)

    ds2 = xr.open_dataset(f2)
    pq.write_table(pa.Table.from_pandas(ds2.to_dataframe()), fp2)

    ds = ds1.merge(ds2)
    ds.to_netcdf("dataset3.nc")

    # dask required TODO...
    # with xr.open_mfdataset('../data/*') as ds:
    #     print(ds.keys())

    df = ds.to_dataframe()
    table = pa.Table.from_pandas(df)
    print(table.to_pandas())

    pq.write_table(table, fp3)

    for f in [f1, fp1, f2, fp2, fp3]:
        print("{}           {} MB".format(f, size_mb(os.path.getsize(f))))
Example #35
 def merge_non_audio_summaries(self):
     """
     combines and replaces all summaries per type except for audio summaries
     """
     smrs_dict = {}
     for smry in self.summaries:
         if smry.stype != SensorType.AUDIO:
             if smry.stype in smrs_dict.keys():
                 smrs_dict[smry.stype].append(smry)
             else:
                 smrs_dict[smry.stype] = [smry]
     self.summaries = self.get_audio()
     for styp, smrys in smrs_dict.items():
         first_summary = smrys.pop(0)
         tbl = first_summary.data()
         combined_mint = np.mean([smrs.smint_s for smrs in smrys])
         combined_std = np.mean([smrs.sstd_s for smrs in smrys])
         if not first_summary.check_data():
             os.makedirs(first_summary.fdir, exist_ok=True)
         for smrs in smrys:
             tbl = pa.concat_tables([tbl, smrs.data()])
             if not first_summary.check_data():
                 os.remove(smrs.file_name())
         if first_summary.check_data():
             first_summary._data = tbl
         else:
             pq.write_table(tbl, first_summary.file_name())
         mnint = dtu.microseconds_to_seconds(float(np.mean(np.diff(tbl["timestamps"].to_numpy()))))
         stdint = dtu.microseconds_to_seconds(float(np.std(np.diff(tbl["timestamps"].to_numpy()))))
         if not combined_mint + combined_std > mnint > combined_mint - combined_std:
             self.errors.append(f"Mean interval s of combined {styp.name} sensor does not match the "
                                f"compilation of individual mean interval s per packet.  Will use compilation of "
                                f"individual values.")
             mnint = combined_mint
             stdint = combined_std
         single_smry = PyarrowSummary(first_summary.name, styp, first_summary.start,
                                      1 / mnint, first_summary.fdir, tbl.num_rows, mnint, stdint,
                                      first_summary.data() if first_summary.check_data() else None
                                      )
         self.summaries.append(single_smry)
Example #36
    def save_data(self,
                  data,
                  format='parquet',
                  resolution='time',
                  errors=False):
        """fn: to save data to directory

        # Args
            data : pd.DataFrame
            format : str, ('parquet', 'h5', 'csv', 'feather')
            resolution : str, date or time
                if date uses default str format,
                if time will use YYYY-MM-DD_HH.MM.SS
            errors : bool,
                if True change filepath name
                if False use options data filepath name
        """
        _dir = self._create_dir()

        if resolution == 'time':
            _timestamp = self.__create_timestamp_str()
        elif resolution == 'date':
            _timestamp = self.__create_date_str()

        if errors:
            _fp = _dir + f'yahoo_options_scraper_errors_{_timestamp}.{format}'
        else:
            _fp = _dir + f'yahoo_options_data_{_timestamp}.{format}'

        if format == 'parquet':
            _table = pa.Table.from_pandas(data)
            pq.write_table(_table, _fp)

        elif format == 'h5':
            data.to_hdf(_fp, key='data')
        elif format == 'csv':
            data.to_csv(_fp, index=False)
        elif format == 'feather':
            data.to_feather(_fp)
        return
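A hedged call sketch for the method above; scraper stands in for an instance of the (unnamed) class that defines save_data, and the DataFrame is hypothetical:

import pandas as pd

options = pd.DataFrame({'strike': [100.0, 105.0], 'bid': [1.2, 0.8]})
# Writes yahoo_options_data_<date>.parquet into the directory the instance manages.
scraper.save_data(options, format='parquet', resolution='date')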
Example #37
    def test_use_nullable_dtypes(self, engine):
        import pyarrow.parquet as pq

        if engine == "fastparquet":
            # We are manually disabling fastparquet's
            # nullable dtype support pending discussion
            pytest.skip("Fastparquet nullable dtype support is disabled")

        table = pyarrow.table(
            {
                "a": pyarrow.array([1, 2, 3, None], "int64"),
                "b": pyarrow.array([1, 2, 3, None], "uint8"),
                "c": pyarrow.array(["a", "b", "c", None]),
                "d": pyarrow.array([True, False, True, None]),
                # Test that nullable dtypes are used even in absence of nulls
                "e": pyarrow.array([1, 2, 3, 4], "int64"),
            }
        )
        with tm.ensure_clean() as path:
            # write manually with pyarrow to write integers
            pq.write_table(table, path)
            result1 = read_parquet(path, engine=engine)
            result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
            }
        )
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)
Example #38
def combine_features(metadata_filename: str) -> None:
    """
    Combine feature files for multiple datasets into a single feature file.

    If the combined feature file already exists it will _not_ be recreated.

    Parameters
    ----------
    metadata_filename : str
        Features for all datasets included in the metadata will be combined.
        Should be a Parquet file.
    """
    feat_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'feature')
    feat_filename = os.path.join(
        feat_dir,
        os.path.splitext(os.path.basename(metadata_filename))[0].replace(
            'metadata', 'feature'))
    if (os.path.isfile(f'{feat_filename}.npz')
            and os.path.isfile(f'{feat_filename}.parquet')):
        return
    datasets = pd.read_parquet(metadata_filename,
                               columns=['dataset'])['dataset'].unique()
    logger.info('Combine features for metadata file %s containing %d datasets',
                metadata_filename, len(datasets))
    encodings, indexes = [], []
    for i, dataset in enumerate(datasets, 1):
        logger.debug('Append dataset %s [%3d/%3d]', dataset, i, len(datasets))
        dataset_encodings_filename = os.path.join(feat_dir, 'dataset',
                                                  f'{dataset}.npz')
        dataset_index_filename = os.path.join(feat_dir, 'dataset',
                                              f'{dataset}.parquet')
        if (not os.path.isfile(dataset_encodings_filename)
                or not os.path.isfile(dataset_index_filename)):
            logger.warning('Missing features for dataset %s, skipping...',
                           dataset)
        else:
            encodings.append(ss.load_npz(dataset_encodings_filename))
            indexes.append(pq.read_table(dataset_index_filename))
    ss.save_npz(f'{feat_filename}.npz', ss.vstack(encodings, 'csr'))
    pq.write_table(pa.concat_tables(indexes), f'{feat_filename}.parquet')
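A hedged sketch of how the function above might be invoked; the GLEAMS_HOME location and metadata file name are hypothetical, and the per-dataset .npz/.parquet feature files are expected to exist as described in the function body:

import os

os.environ.setdefault('GLEAMS_HOME', '/data/gleams')  # hypothetical install location
combine_features('metadata_train.parquet')  # hypothetical metadata Parquet file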
Example #39
    def stream_csv(self, in_io):
        parsed_rows = 0
        out = io.StringIO()
        out_parquet = io.BytesIO()
        header_rows = self.header_fields.keys()
        df_data = list()
        writer = csv.writer(out, delimiter=',')
        writer.writerow(header_rows)
        lines = in_io.decode('utf-8').split('\n')
        logging.info("got {} lines to parse".format(len(lines)))
        for line_num, line in enumerate(lines):
            if not self.is_valid_format(line, line_num):
                continue

            result, uuid = self.json_to_csv(self.extract_json(line, line_num))
            if uuid in self.uuids:
                continue

            self.uuids.add(uuid)
            writer.writerow(result)
            df_data.append(result)
            parsed_rows += 1
        df = pd.DataFrame(df_data, columns=header_rows)

        # Pyarrow tries to infer types by default.
        # Explicitly set the types to prevent mis-typing.
        df = self.apply_df_types(df)

        # Convert pandas.DataFrame -> pyarrow.Table (Parquet)
        table = pa.Table.from_pandas(df)

        # Write parquet table.
        pq.write_table(table, out_parquet, compression='snappy')

        # Reset all FP's
        out_parquet.seek(0)
        out.seek(0)

        total_rows = len(lines)
        return parsed_rows, total_rows, out, out_parquet
Example #40
def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8':
        np.arange(size, dtype=np.uint8),
        'uint16':
        np.arange(size, dtype=np.uint16),
        'uint32':
        np.arange(size, dtype=np.uint32),
        'uint64':
        np.arange(size, dtype=np.uint64),
        'int8':
        np.arange(size, dtype=np.int16),
        'int16':
        np.arange(size, dtype=np.int16),
        'int32':
        np.arange(size, dtype=np.int32),
        'int64':
        np.arange(size, dtype=np.int64),
        'float32':
        np.arange(size, dtype=np.float32),
        'float64':
        np.arange(size, dtype=np.float64),
        'bool':
        np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Example #41
def test_write_metadata(tempdir):
    path = str(tempdir / "metadata")
    schema = pa.schema([("a", "int64"), ("b", "float64")])

    # write a pyarrow schema
    pq.write_metadata(schema, path)
    parquet_meta = pq.read_metadata(path)
    schema_as_arrow = parquet_meta.schema.to_arrow_schema()
    assert schema_as_arrow.equals(schema)

    # ARROW-8980: Check that the ARROW:schema metadata key was removed
    if schema_as_arrow.metadata:
        assert b'ARROW:schema' not in schema_as_arrow.metadata

    # pass through writer keyword arguments
    for version in ["1.0", "2.0", "2.4", "2.6"]:
        pq.write_metadata(schema, path, version=version)
        parquet_meta = pq.read_metadata(path)
        # The version is stored as a single integer in the Parquet metadata,
        # so it cannot correctly express dotted format versions
        expected_version = "1.0" if version == "1.0" else "2.6"
        assert parquet_meta.format_version == expected_version

    # metadata_collector: list of FileMetaData objects
    table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
    pq.write_table(table, tempdir / "data.parquet")
    parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
    pq.write_metadata(
        schema, path, metadata_collector=[parquet_meta, parquet_meta]
    )
    parquet_meta_mult = pq.read_metadata(path)
    assert parquet_meta_mult.num_row_groups == 2

    # append metadata with different schema raises an error
    with pytest.raises(RuntimeError, match="requires equal schemas"):
        pq.write_metadata(
            pa.schema([("a", "int32"), ("b", "null")]),
            path, metadata_collector=[parquet_meta, parquet_meta]
        )
Example #42
def apply(df, path, parameters=None):
    """
    Exports a dataframe to a Parquet file

    Parameters
    ------------
    df
        Dataframe
    path
        Path
    parameters
        Possible parameters of the algorithm
    """
    if parameters is None:
        parameters = {}

    compression = parameters.get("compression", "snappy")

    df.columns = [x.replace(":", "AAA") for x in df.columns]
    df = pa.Table.from_pandas(df)
    pq.write_table(df, path, compression=compression)
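A minimal, hedged usage sketch for the exporter above (hypothetical event-log columns; note that ':' in column names is rewritten to 'AAA' before writing):

import pandas as pd

log = pd.DataFrame({
    'case:concept:name': ['c1', 'c1', 'c2'],
    'concept:name': ['register', 'approve', 'register'],
})
# Override the default snappy codec via the parameters dict.
apply(log, 'event_log.parquet', parameters={'compression': 'gzip'})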
Example #43
    def write_pandas(
        self,
        df: pd.DataFrame,
        compression="snappy",
        num_chunks: int = None,
        chunk_size: int = None,
        schema: pyarrow.Schema = None,
    ):
        """Write DataFrame as Parquet Dataset"""
        # Check arguments
        if not isinstance(self.path_or_paths, str):
            msg = f"Cannot write table to {self.path_or_paths} (expected string)"
            raise TypeError(msg)
        if num_chunks is not None and chunk_size is not None:
            msg = "Both num_chunks and chunk_size are given, which is not allowed"
            raise ValueError(msg)
        if chunk_size is not None:
            num_chunks = max(len(df) // chunk_size, 1)

        # Write DataFrame to parquet
        if num_chunks is None:
            table = pyarrow.Table.from_pandas(df,
                                              schema=schema,
                                              preserve_index=False)
            self.write(table, compression=compression)
        else:
            Path(self.path_or_paths).mkdir(parents=True,
                                           exist_ok=True,
                                           filesystem=self.filesystem)
            chunks = np.array_split(df, num_chunks)
            for idx, chunk in enumerate(chunks):
                filename = f"part-{idx:05d}.parquet.{compression}"
                chunk_path = Path(self.path_or_paths, filename)
                LOGGER.info(f"Writing chunk:{idx} to {chunk_path}")
                with chunk_path.open("wb", filesystem=self.filesystem) as file:
                    table = pyarrow.Table.from_pandas(chunk,
                                                      schema=schema,
                                                      preserve_index=False)
                    pq.write_table(table, file, compression=compression)
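A hedged call sketch for the method above; dataset stands in for an instance of the containing class, constructed with a directory path, and the frame is hypothetical. With chunk_size given, the frame is split into parts and each part is written as its own Parquet file:

import pandas as pd

df = pd.DataFrame({'id': range(10), 'value': range(10)})
# 10 rows with chunk_size=4 -> num_chunks = max(10 // 4, 1) = 2 part files.
dataset.write_pandas(df, compression='snappy', chunk_size=4)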
Example #44
def local_to_parquet():
    files = [f for f in listdir(path) if isfile(join(path, f))]
    for file in files:
        df = pd.read_csv(path + '\\' + file)
        df = df[df['text_type'].isin(['post', 'comment'])]  # only keep rows with correct formatting
        if (len(df.columns) > 7):
            print("wait")
        df.drop(
            df.iloc[:, 7:], inplace=True, axis=1
        )  #drop columns created due to wrong formatting #TBD with parquet
        df = df.fillna('')  #change NaNs to empty string
        df['tickers'] = df['tickers'].apply(tickers_to_list)
        df['post_id'] = df['post_id'].astype(str)
        df['comment_id'] = df['comment_id'].astype(str)
        df['subreddit'] = df['subreddit'].astype(str)
        df['text_type'] = df['text_type'].astype(str)
        filename = file.replace(".csv", '')
        path_tsv = path_parquet + filename + ".parquet"
        print(df.head())
        table_pa = pa.Table.from_pandas(df)
        pq.write_table(table_pa, path_tsv)
Example #45
def load_raw():
    # note manually removed some bad row
    kwargs = get_pandas_read_csv_defaults()
    kwargs['thousands'] = ',' # always do this
    kwargs['parse_dates'] = ['Date']
    kwargs['na_values'] = ['-']
    kwargs['dtype'] = 'str'
    dtype = {
     'Close': 'float',
     'High': 'float',
     'Low': 'float',
     'Market Cap': 'float',
     'Open': 'float',
     'Volume': 'float'
     }

    meta = pd.read_csv(os.path.join(_mydir, 'Top100Cryptos/data/100 List.csv'))
    names = meta.Name.tolist()
    files = [os.path.join(_mydir, 'Top100Cryptos/data/{}.csv'.format(x)) for x in names]
    # files = glob.glob(os.path.join(_mydir, 'Top100Cryptos/data/*.csv'))
    dfs = list()
    datadir = os.path.join(_mydir, 'parsed')
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    for i, (name, f) in enumerate(zip(names, files)):
        mtime = os.path.getmtime(f)
        dirname = os.path.join(datadir, 'name={}/mtime={}'.format(name, mtime))
        filename = os.path.join(dirname, 'data.parquet')
        if not os.path.exists(filename):
            df = pd.read_csv(f, **kwargs)
            df = pa.Table.from_pandas(df)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            print('writing {}'.format(filename))
            pq.write_table(df, filename)
            pq.read_table('./parsed') # test
        else:
            print('{} exists'.format(filename))
    return pq.read_table('./parsed') # test
Example #46
    def write_parquet(self):
        # TODO: Test and create docs for
        import pyarrow as pa
        import pyarrow.parquet as pq

        # Generate the schema
        field_list = []
        # TODO: Catch and exit gracefully
        # Will fail if the extractor does not have the class var schema_fields
        for k, v in self.schema_fields.items():
            field_list.append(pa.field(k, v))

        schema = pa.schema(field_list)

        # Create pyarrow table
        column_names = []
        columns = []
        for column in schema:
            column_values = [dic.get(column.name) for dic in self.data]
            try:
                columns.append(pa.array(column_values, type=column.type))
            except Exception:
                logger.exception(("Could not create array"
                                  f" for column: {column.name}"))
                raise
            column_names.append(column.name)

        record_batch = pa.RecordBatch.from_arrays(columns, column_names)
        table = pa.Table.from_batches([record_batch])

        output_io = io.BytesIO()
        pq.write_table(table, output_io)
        output_io.seek(0)

        # TODO: Is this the correct content type for a parquet file?
        return SaveTo(self.scraper,
                      output_io,
                      content_type='application/octet-stream',
                      encoding=self.encoding)
Example #47
def df_to_parquet(df, filename, compression='SNAPPY'):
    """df_to_parquet: Converts a Pandas DataFrame into a Parquet file
        Args:
            df (pandas.DataFrame): The Pandas DataFrame to be saved as a Parquet file
            filename (string): The full path to the filename for the Parquet file
            compression (string): The Parquet compression codec to use (default 'SNAPPY')
    """

    # Right now there are two open Parquet issues
    # Timestamps in Spark: https://issues.apache.org/jira/browse/ARROW-1499
    # TimeDelta Support: https://issues.apache.org/jira/browse/ARROW-835
    for column in df.columns:
        if (df[column].dtype == 'timedelta64[ns]'):
            print('Converting timedelta column {:s}...'.format(column))
            df[column] = df[column].astype(str)

    arrow_table = pa.Table.from_pandas(df)
    if compression == 'UNCOMPRESSED':
        compression = None
    pq.write_table(arrow_table,
                   filename,
                   compression=compression,
                   use_deprecated_int96_timestamps=True)
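A short usage sketch for the helper above with a hypothetical frame that contains a timedelta column, which the function converts to strings before writing:

import pandas as pd

flows = pd.DataFrame({
    'duration': pd.to_timedelta(['1s', '250ms', '2min']),
    'bytes': [1024, 2048, 4096],
})
# 'duration' is timedelta64[ns], so it is cast to str before the write.
df_to_parquet(flows, 'flows.parquet', compression='UNCOMPRESSED')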
Example #48
def _write_partition_pyarrow(df,
                             open_with,
                             filename,
                             write_index,
                             metadata_path=None,
                             **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {
                k: v
                for k, v in kwargs.items()
                if k in _pyarrow_write_metadata_kwargs
            }
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
Example #49
def test_metadata_exceeds_message_size():
    # ARROW-13655: Thrift may enable a default message size that limits
    # the size of Parquet metadata that can be written.
    NCOLS = 1000
    NREPEATS = 4000

    table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})

    with pa.BufferOutputStream() as out:
        pq.write_table(table, out)
        buf = out.getvalue()

    original_metadata = pq.read_metadata(pa.BufferReader(buf))
    metadata = pq.read_metadata(pa.BufferReader(buf))
    for i in range(NREPEATS):
        metadata.append_row_groups(original_metadata)

    with pa.BufferOutputStream() as out:
        metadata.write_metadata_file(out)
        buf = out.getvalue()

    metadata = pq.read_metadata(pa.BufferReader(buf))
Example #50
 def fixColumnNames(pdf):
     """
     Copy files from cc3 folder to temporary location for fixing column names
     :return:
     """
     try:
         hdfs = pa.hdfs.connect("dantooine10dot", 8020)
         old_folder_name = "/cc3/"
         new_folder_name = "/fixed_column_names/"
         user_folder = pdf["user_folder"].iloc[0]
         new_location = user_folder.replace(old_folder_name,
                                            new_folder_name)
         hdfs.mkdir(new_location)
         for index, row in pdf.iterrows():
             file_name = "hdfs://dantooine10dot:8020" + row["file_name"]
             if hdfs.exists(row["file_name"].replace(
                     old_folder_name, new_folder_name)):
                 #print("ALREADY PROCESSED - ",row["file_name"].replace(old_folder_name,new_folder_name))
                 pass
             else:
                 data = pq.read_table(file_name)
                 try:
                     data2 = data.drop(["__index_level_0__"])
                 except:
                     data2 = data
                 if isinstance(row["corrected_schema"], str):
                     new_column_names = eval(row["corrected_schema"])
                 else:
                     new_column_names = row["corrected_schema"]
                 data3 = data2.rename_columns(new_column_names)
                 pq.write_table(
                     data3,
                     file_name.replace(old_folder_name, new_folder_name))
         return pd.DataFrame([[user_folder, 1]],
                             columns=['user_folder', 'success'])
     except Exception as e:
         print("*" * 10, user_folder, str(e))
         return pd.DataFrame([[user_folder, 0]],
                             columns=['user_folder', 'success'])
Example #51
def test_parquet_nested_storage(tmpdir):
    # Parquet support for extension types with nested storage type
    import pyarrow.parquet as pq

    struct_array = pa.StructArray.from_arrays(
        [pa.array([0, 1], type="int64"),
         pa.array([4, 5], type="int64")],
        names=["left", "right"])
    list_array = pa.array([[1, 2, 3], [4, 5]], type=pa.list_(pa.int32()))

    mystruct_array = pa.ExtensionArray.from_storage(MyStructType(),
                                                    struct_array)
    mylist_array = pa.ExtensionArray.from_storage(MyListType(), list_array)

    orig_table = pa.table({'structs': mystruct_array, 'lists': mylist_array})
    filename = tmpdir / 'nested_extension_storage.parquet'
    pq.write_table(orig_table, filename)

    table = pq.read_table(filename)
    assert table.column(0).type == mystruct_array.type
    assert table.column(1).type == mylist_array.type
    assert table == orig_table
Example #52
def test_noncoerced_nanoseconds_written_without_exception(tempdir):
    # ARROW-1957: the Parquet version 2.0 writer preserves Arrow
    # nanosecond timestamps by default
    n = 9
    df = pd.DataFrame({'x': range(n)},
                      index=pd.date_range('2017-01-01', freq='1n', periods=n))
    tb = pa.Table.from_pandas(df)

    filename = tempdir / 'written.parquet'
    try:
        pq.write_table(tb, filename, version='2.6')
    except Exception:
        pass
    assert filename.exists()

    recovered_table = pq.read_table(filename)
    assert tb.equals(recovered_table)

    # Loss of data through coercion (without explicit override) is still an error
    filename = tempdir / 'not_written.parquet'
    with pytest.raises(ValueError):
        pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
Example #53
def test_fastparquet_read_with_hdfs(client):
    try:
        import snappy  # noqa
    except ImportError:
        pytest.skip('fastparquet test requires snappy')

    import pyarrow.parquet as pq
    fastparquet = pytest.importorskip('fastparquet')

    fs = hdfs_test_client(client)

    df = pdt.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    pdt.assert_frame_equal(result, df)
Example #54
    def bench_write(self, niter=2):
        print("Reading text file: {}".format(self.csv_path))
        df = pd.read_csv(self.csv_path,
                         sep=self.sep,
                         header=self.header,
                         low_memory=False)
        if self.header is None:
            df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

        def _get_table(df):
            return (pa.Table.from_pandas(
                df, preserve_index=False).replace_schema_metadata(None))

        t = _get_table(df)

        cases = [
            ('parquet (UNC)', 'arrow Table', lambda: pq.write_table(
                t, self.parquet_unc_path, compression='NONE')),
            ('parquet (UNC)', 'pandas', lambda: pq.write_table(
                _get_table(df), self.parquet_unc_path, compression='NONE')),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: pq.write_table(t, self.parquet_snappy_path)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: pq.write_table(_get_table(df), self.parquet_snappy_path)),
            ('feather V2 (UNC)', 'pandas', lambda: feather.write_feather(
                df, self.feather_unc_path, compression='uncompressed')),
            ('feather V2 (UNC)', 'arrow Table', lambda: feather.write_feather(
                t, self.feather_unc_path, compression='uncompressed')),
            ('feather V2 (LZ4)', 'pandas', lambda: feather.write_feather(
                df, self.feather_lz4_path, compression='lz4')),
            ('feather V2 (LZ4)', 'arrow Table', lambda: feather.write_feather(
                t, self.feather_lz4_path, compression='lz4')),
            ('feather V2 (ZSTD)', 'pandas', lambda: feather.write_feather(
                df, self.feather_zstd_path, compression='zstd')),
            ('feather V2 (ZSTD)', 'arrow Table', lambda: feather.write_feather(
                t, self.feather_zstd_path, compression='zstd'))
        ]

        return self._bench_cases(cases, niter)
Example #55
def multisourcefs(request):
    request.config.pyarrow.requires('pandas')
    request.config.pyarrow.requires('parquet')
    import pyarrow.parquet as pq

    df = _generate_data(1000)
    mockfs = fs._MockFileSystem()

    # simply split the dataframe into four chunks to construct a data source
    # from each chunk into its own directory
    df_a, df_b, df_c, df_d = np.array_split(df, 4)

    # create a directory containing a flat sequence of parquet files without
    # any partitioning involved
    mockfs.create_dir('plain')
    for i, chunk in enumerate(np.array_split(df_a, 10)):
        path = 'plain/chunk-{}.parquet'.format(i)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with schema partitioning by week and color
    mockfs.create_dir('schema')
    for part, chunk in df_b.groupby([df_b.date.dt.week, df_b.color]):
        folder = 'schema/{}/{}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with hive partitioning by year and month
    mockfs.create_dir('hive')
    for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]):
        folder = 'hive/year={}/month={}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    # create one with hive partitioning by color
    mockfs.create_dir('hive_color')
    for part, chunk in df_d.groupby(["color"]):
        folder = 'hive_color/color={}'.format(*part)
        path = '{}/chunk.parquet'.format(folder)
        mockfs.create_dir(folder)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(_table_from_pandas(chunk), out)

    return mockfs
Example #56
def execute_perm_imp(respseptup):
    """
    Standalone function that writes the permutation importance of this combination
    of responseagg and separation to a subdirectory.
    Returns nothing.
    """
    responseagg, separation = respseptup
    retpath = OUTDIR / str(responseagg) / str(separation)
    if not retpath.exists():
        X, y = read_data(responseagg=responseagg,
                         separation=separation,
                         quantile=0.666)
        #def wrapper(self, *args, **kwargs):
        #    return self.predict_proba(*args,**kwargs)[:,-1] # Last class is True
        #RandomForestClassifier.predict = wrapper # To avoid things inside permutation importance package  where it is only possible to invoke probabilistic prediction with twoclass y.
        #m = RandomForestClassifier(max_depth = 7, n_estimators = 1500, min_samples_split = 40, max_features = 35, n_jobs = njobs_per_imp)
        model = HybridExceedenceModel(fit_base_to_all_cv=True,
                                      max_depth=5,
                                      n_estimators=2500,
                                      min_samples_split=30,
                                      max_features=35,
                                      n_jobs=njobs_per_imp)
        ret = permute_importance(model,
                                 X_in=X,
                                 y_in=y,
                                 on_validation=False,
                                 evaluation_fn=brier_score_loss,
                                 n_folds=5,
                                 perm_imp_kwargs=dict(nimportant_vars=30,
                                                      njobs=njobs_per_imp,
                                                      nbootstrap=1500))
        retpath.mkdir(parents=True)
        pq.write_table(pa.Table.from_pandas(ret),
                       retpath / 'responsagg_separation.parquet')
        logging.debug(
            f'subprocess has written out importance frame at {retpath}')
    else:
        logging.debug(f'importance frame at {retpath} already exists')
Example #57
    def _write_multiple_hdfs_pq_files(self, tmpdir):
        import pyarrow.parquet as pq
        nfiles = 10
        size = 5
        test_data = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)

        expected = pa.concat_tables(test_data)
        return expected
Example #58
def test_columns_index_with_multi_index(tmpdir, engine):
    fn = os.path.join(str(tmpdir), 'test.parquet')
    index = pd.MultiIndex.from_arrays([np.arange(10), np.arange(10) + 1],
                                      names=['x0', 'x1'])
    df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'], index=index)
    df2 = df.reset_index(drop=False)

    if engine == 'fastparquet':
        fastparquet.write(fn, df, write_index=True)

        # fastparquet doesn't support multi-index
        with pytest.raises(ValueError):
            ddf = dd.read_parquet(fn, engine=engine)
    else:
        import pyarrow as pa
        pq.write_table(pa.Table.from_pandas(df), fn)

        # Pyarrow supports multi-index reads
        ddf = dd.read_parquet(fn, engine=engine)
        assert_eq(ddf, df)

        d = dd.read_parquet(fn, columns='a', engine=engine)
        assert_eq(d, df['a'])

        d = dd.read_parquet(fn, index=['a', 'b'], columns=['x0', 'x1'], engine=engine)
        assert_eq(d, df2.set_index(['a', 'b'])[['x0', 'x1']])

    # Just index
    d = dd.read_parquet(fn, index=False, engine=engine)
    assert_eq(d, df2)

    d = dd.read_parquet(fn, index=['a'], engine=engine)
    assert_eq(d, df2.set_index('a')[['b']])

    d = dd.read_parquet(fn, index=['x0'], engine=engine)
    assert_eq(d, df2.set_index('x0')[['a', 'b']])

    # Just columns
    d = dd.read_parquet(fn, columns=['x0', 'a'], engine=engine)
    assert_eq(d, df2.set_index('x1')[['x0', 'a']])

    # Both index and columns
    d = dd.read_parquet(fn, index=False, columns=['x0', 'b'], engine=engine)
    assert_eq(d, df2[['x0', 'b']])

    for index in ['x1', 'b']:
        d = dd.read_parquet(fn, index=index, columns=['x0', 'a'], engine=engine)
        assert_eq(d, df2.set_index(index)[['x0', 'a']])

    # Columns and index intersect
    for index in ['a', 'x0']:
        with pytest.raises(ValueError):
            d = dd.read_parquet(fn, index=index, columns=['x0', 'a'], engine=engine)

    # Series output
    for ind, col, sol_df in [(None, 'x0', df2.set_index('x1')),
                             (False, 'b', df2),
                             (False, 'x0', df2),
                             ('a', 'x0', df2.set_index('a')),
                             ('a', 'b', df2.set_index('a'))]:
        d = dd.read_parquet(fn, index=ind, columns=col, engine=engine)
        assert_eq(d, sol_df[col])
Example #59
def _write_table(*args, **kwargs):
    import pyarrow.parquet as pq
    return pq.write_table(*args, **kwargs)
Example #60
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)