Example #1
import pyarrow as pa
import vaex

def test_concat_timestamp():
    df1 = pa.Table.from_arrays(
        [pa.array(['2020-01-31', '2020-01-31']).cast('timestamp[us]')],
        names=['ts'])
    df2 = pa.Table.from_arrays(
        [pa.array(['2020-12-31', '2020-12-31']).cast('timestamp[ns]')],
        names=['ts'])
    df1_vx = vaex.from_arrow_table(df1)
    df2_vx = vaex.from_arrow_table(df2)
    df = vaex.concat([df1_vx, df2_vx])
    assert df.ts.tolist() == df1['ts'].to_pylist() + df2['ts'].to_pylist()
    assert df.ts.dtype.internal == pa.timestamp('ns')
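The final assertion relies on Arrow timestamp units: concatenating a ``timestamp[us]`` column with a ``timestamp[ns]`` column upcasts the result to the finer unit. A minimal sketch of that upcast in pure pyarrow (array contents are placeholders):

import pyarrow as pa

us = pa.array(['2020-01-31']).cast(pa.timestamp('us'))
ns = us.cast(pa.timestamp('ns'))  # microsecond -> nanosecond is lossless
assert ns.type == pa.timestamp('ns')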
Example #2
# Library code from geovaex; relies on module-level imports (os, warnings,
# pyarrow as pa, geovaex.io) and sibling helpers such as from_arrow_table,
# from_arrow_spatial_table, to_arrow and open.
def read_file(path, convert=True, **kwargs):
    """Reads a generic spatial file.
    Parameters:
        path (string): The spatial file's full path.
        convert (bool|string): When a string, the file is exported to an
            arrow file at that path. When True, the arrow path is derived
            from ``path`` by replacing its extension with ``.arrow``. When
            False, the file is read directly and nothing is exported.
        **kwargs: Extra keyword arguments, forwarded to the underlying
            reader/converter.
    Returns:
        (object) A GeoDataFrame object.
    """
    if not convert:
        table = pa.concat_tables(geovaex.io.to_arrow_table(path, **kwargs),
                                 promote=False)
        metadata = table.schema.metadata
        if metadata is not None and b'geovaex version' in metadata:
            df = from_arrow_spatial_table(table)
            has_geometry = df.geometry.get_raw_geometry().null_count != len(df.geometry)
            if has_geometry:
                return df
            # Every geometry is null: drop the column and fall through.
            table = table.drop(['geometry'])

        warnings.warn('Not a spatial file. Returning a Vaex DataFrame.')
        df = from_arrow_table(table).copy()
        return df

    # ``convert is True`` derives the target next to the source file;
    # a string value is used verbatim as the target path.
    arrow_file = os.path.splitext(path)[0] + '.arrow' if convert is True else convert
    to_arrow(path, arrow_file, **kwargs)
    return open(arrow_file)  # geovaex's open (see Example #5), not the builtin
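A hypothetical usage sketch, assuming ``read_file`` is exposed at the ``geovaex`` top level and that the placeholder shapefile exists:

import geovaex

gdf = geovaex.read_file('data/countries.shp')                  # writes data/countries.arrow, then opens it
gdf = geovaex.read_file('data/countries.shp', convert=False)   # read directly, no arrow export
gdf = geovaex.read_file('data/countries.shp', convert='/tmp/countries.arrow')  # export to a chosen path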
Example #3
import glob
import shutil

import vaex

def test_partitioning_write_hdf5():
    # Assumes module-level pytest fixtures: ``table`` (a pyarrow Table whose
    # 'country' column holds the 3 unique values US, NL, FR) and
    # ``data_path`` (a pathlib.Path).
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_vaex_my_choice',
                  ignore_errors=True)
    df = vaex.from_arrow_table(table)
    df.export_partitioned(
        data_path /
        'parquet_dataset_partitioned_vaex_my_choice/{subdir}/{i}.hdf5',
        ['country'])
    base = data_path / 'parquet_dataset_partitioned_vaex_my_choice'
    assert len(glob.glob(str(base / '*/*.hdf5'))) == 3  # 3 unique values
    assert len(glob.glob(str(base / 'country=US/[012].hdf5'))) == 1
    assert len(glob.glob(str(base / 'country=NL/[012].hdf5'))) == 1
    assert len(glob.glob(str(base / 'country=FR/[012].hdf5'))) == 1
Example #4
import glob
import shutil

import vaex

def test_partitioning_write_parquet():
    # Same module fixtures as above, plus the expected contents
    # ``values``, ``years`` and ``countries``.
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_vaex',
                  ignore_errors=True)
    df = vaex.from_arrow_table(table)
    df.export_partitioned(data_path / 'parquet_dataset_partitioned_vaex',
                          ['country', 'year'])
    df = vaex.open(data_path / 'parquet_dataset_partitioned_vaex',
                   partitioning="hive")
    base = data_path / 'parquet_dataset_partitioned_vaex'
    assert len(glob.glob(str(base / '*/*/*.parquet'))) == 5  # 5 unique (country, year) combinations
    assert len(glob.glob(str(base / 'country=US/year=2020/*.parquet'))) == 1
    assert len(glob.glob(str(base / 'country=NL/year=2020/*.parquet'))) == 1
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)
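Because the export uses hive-style key=value directories, the same dataset can be read back with pyarrow alone. A minimal sketch, assuming the directory written by the test above exists:

import pyarrow.dataset as ds

dataset = ds.dataset(str(data_path / 'parquet_dataset_partitioned_vaex'),
                     partitioning='hive')
print(dataset.files)            # e.g. .../country=US/year=2020/....parquet
roundtrip = dataset.to_table()  # partition keys come back as columns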
Example #5
def open(path):  # note: shadows Python's builtin open() within this module
    """Opens an arrow spatial file.
    Parameters:
        path (string): The file's full path.
    Returns:
        (object) A GeoDataFrame object.
    """
    source = pa.memory_map(path)
    try:
        # First, try to read it as the Arrow IPC streaming format.
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # Fall back to the IPC random-access file format.
        reader = pa.ipc.open_file(source)
        # The file reader is not iterable; fetch its batches by index.
        batches = [
            reader.get_batch(i) for i in range(reader.num_record_batches)
        ]
    else:
        # A stream reader iterates over its record batches directly.
        batches = reader
    table = pa.Table.from_batches(batches)
    metadata = table.schema.metadata
    if metadata is not None and b'geovaex version' in metadata:
        print(f"Opened file {os.path.basename(path)}, "
              f"created by geovaex v{metadata[b'geovaex version'].decode()} "
              f"using {metadata[b'driver'].decode()} driver.")
        df = from_arrow_spatial_table(table)
        has_geometry = df.geometry.get_raw_geometry().null_count != len(df.geometry)
        if has_geometry:
            return df
        # Every geometry is null: drop the column and fall through.
        table = table.drop(['geometry'])

    warnings.warn('Not a spatial arrow file. Returning a Vaex DataFrame.')
    df = from_arrow_table(table).copy()
    return df
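The try/except above exists because Arrow IPC has two on-disk flavors, the streaming format and the random-access file format, each with its own reader. A minimal sketch producing both with plain pyarrow (file names are placeholders):

import pyarrow as pa

t = pa.table({'a': [1, 2, 3]})
with pa.OSFile('demo_stream.arrow', 'wb') as sink:
    with pa.ipc.new_stream(sink, t.schema) as writer:
        writer.write_table(t)   # read back with pa.ipc.open_stream
with pa.OSFile('demo_file.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, t.schema) as writer:
        writer.write_table(t)   # read back with pa.ipc.open_file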
Example #6
import vaex

def test_arrow_write_table(tmpdir, as_stream):
    # ``tmpdir`` is pytest's builtin fixture; ``as_stream`` is presumably
    # parametrized over True/False, toggling between the Arrow IPC
    # streaming and random-access file formats (see Example #5).
    path = str(tmpdir.join('test.arrow'))
    vaex.from_arrow_table(table).export_arrow(path, as_stream=as_stream)
    df = vaex.open(path)
    assert 'col1' in df
Example #7
import glob
import shutil

import pyarrow as pa
import pyarrow.dataset  # makes pa.dataset available
import vaex

def test_partitioning_write_directory():
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_directory1',
                  ignore_errors=True)
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_directory2',
                  ignore_errors=True)

    # Directory partitioning: values appear as bare directory names
    # ('US/...'), so reading back requires an explicit schema.
    partitioning = pa.dataset.partitioning(
        pa.schema([("country", pa.string())]))

    df = vaex.from_arrow_table(table)
    df.export_partitioned(data_path / 'parquet_dataset_partitioned_directory1',
                          ['country'],
                          directory_format='{value}')
    base1 = data_path / 'parquet_dataset_partitioned_directory1'
    assert len(glob.glob(str(base1 / '*/*.parquet'))) == 3  # 3 unique values
    assert len(glob.glob(str(base1 / 'US/*.parquet'))) == 1
    assert len(glob.glob(str(base1 / 'NL/*.parquet'))) == 1
    assert len(glob.glob(str(base1 / 'FR/*.parquet'))) == 1

    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)

    # Now with 2 partition keys.
    partitioning = pa.dataset.partitioning(
        pa.schema([("year", pa.int64()), ("country", pa.string())]))
    df.export_partitioned(data_path / 'parquet_dataset_partitioned_directory2',
                          ['year', 'country'],
                          directory_format='{value}')
    base2 = data_path / 'parquet_dataset_partitioned_directory2'
    assert len(glob.glob(str(base2 / '*/*/*.parquet'))) == 5  # 5 unique (year, country) combinations
    assert len(glob.glob(str(base2 / '2020/US/*.parquet'))) == 1
    assert len(glob.glob(str(base2 / '2020/NL/*.parquet'))) == 1

    df = vaex.open(data_path / 'parquet_dataset_partitioned_directory2',
                   partitioning=partitioning)
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)
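With the bare-directory layout there is no key=value hint in the paths, which is why the snippet builds an explicit partitioning schema before reopening. The pure-pyarrow equivalent, assuming the directory written above exists:

import pyarrow as pa
import pyarrow.dataset as ds

part = ds.partitioning(pa.schema([('year', pa.int64()), ('country', pa.string())]))
dataset = ds.dataset(str(data_path / 'parquet_dataset_partitioned_directory2'),
                     partitioning=part)
roundtrip = dataset.to_table()  # 'year' and 'country' are reconstructed from the paths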