Example #1
    def bench_read(self, niter=5):
        cases = [
            ('parquet (UNC)', 'arrow Table',
             lambda: pq.read_table(self.parquet_unc_path, memory_map=False)),
            ('parquet (UNC)', 'pandas',
             lambda: pq.read_table(self.parquet_unc_path,
                                   memory_map=False).to_pandas()),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: pq.read_table(self.parquet_snappy_path,
                                   memory_map=False)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: pq.read_table(self.parquet_snappy_path,
                                   memory_map=False).to_pandas()),
            ('feather V2 (UNC)', 'pandas',
             lambda: feather.read_feather(self.feather_unc_path,
                                          memory_map=False)),
            ('feather V2 (LZ4)', 'pandas',
             lambda: feather.read_feather(self.feather_lz4_path,
                                          memory_map=False)),
            ('feather V2 (ZSTD)', 'pandas',
             lambda: feather.read_feather(self.feather_zstd_path,
                                          memory_map=False)),
            ('feather V2 (UNC)', 'arrow Table',
             lambda: feather.read_table(self.feather_unc_path,
                                        memory_map=False)),
            ('feather V2 (LZ4)', 'arrow Table',
             lambda: feather.read_table(self.feather_lz4_path,
                                        memory_map=False)),
            ('feather V2 (ZSTD)', 'arrow Table',
             lambda: feather.read_table(self.feather_zstd_path,
                                        memory_map=False)),
        ]

        return self._bench_cases(cases, niter)
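
The `_bench_cases` helper is not shown in this example. A minimal sketch of what it might look like (name and signature follow the call above; the body is an assumption):

    def _bench_cases(self, cases, niter):
        # Hypothetical helper: time each callable `niter` times and keep
        # the best wall-clock duration per (format, output type) pair.
        import time
        results = []
        for fmt, output_type, func in cases:
            times = []
            for _ in range(niter):
                start = time.perf_counter()
                func()
                times.append(time.perf_counter() - start)
            results.append((fmt, output_type, min(times)))
        return results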
Example #2
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()

    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=num_values)

    columns = ['col_' + str(i) for i in range(100)]
    df = pd.DataFrame(values, columns=columns)
    write_feather(df, path, version=version)

    table = pa.Table.from_pandas(pd.DataFrame(values, columns=columns))

    result = read_table(path)
    assert_frame_equal(table.to_pandas(), result.to_pandas())

    # Test without memory mapping
    result = read_table(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result.to_pandas())

    result = read_feather(path, memory_map=False)
    assert_frame_equal(table.to_pandas(), result)
Example #3
def read_feather(path, *args, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read feather dataset, this may "
                  "be GPU accelerated in the future")
    pa_table = feather.read_table(path, *args, **kwargs)
    return DataFrame.from_arrow(pa_table)
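
The `{docstring}` placeholder above is normally filled in at import time by a doc-templating helper. A minimal sketch of that pattern (`doc_apply` is hypothetical, not the library's actual API):

def doc_apply(doc):
    # Replace the decorated function's __doc__ with a shared template.
    def wrapper(func):
        func.__doc__ = doc
        return func
    return wrapper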
Example #4
def test_read_column_duplicated_in_file(tempdir):
    # duplicated columns in feather file (only works for feather v2)
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'a'])
    path = str(tempdir / "data.feather")
    write_feather(table, path, version=2)

    # no selection works fine
    result = read_table(path)
    assert result.equals(table)

    # selection with indices works
    result = read_table(path, columns=[0, 2])
    assert result.column_names == ['a', 'a']

    # selection with column names errors
    with pytest.raises(ValueError):
        read_table(path, columns=['a', 'b'])
Example #5
def test_read_column_duplicated_selection(tempdir, version):
    # duplicated columns in the column selection
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'c'])
    path = str(tempdir / "data.feather")
    write_feather(table, path, version=version)

    for col_selection in [['a', 'b', 'a'], [0, 1, 0]]:
        result = read_table(path, columns=col_selection)
        assert result.column_names == ['a', 'b', 'a']
Example #6
    def table(self):
        if self._table is None:
            path = self.temp_path("feather", "lz4")
            if path.exists():
                self._table = feather.read_table(path, memory_map=False)
            else:
                self._table = pyarrow.Table.from_pandas(
                    self.dataframe,
                    preserve_index=False,
                ).replace_schema_metadata(None)
        return self._table
Example #7
def _check_arrow_roundtrip(table, path=None, compression=None):
    if path is None:
        path = random_path()

    TEST_FILES.append(path)
    write_feather(table, path, compression=compression)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_table(path)
    assert result.equals(table)
Example #8
    def read_table(self, source: tp.BinaryIO,
                   schema: tp.Optional[pa.Schema]) -> pa.Table:

        try:
            columns = schema.names if schema else None
            return pa_ft.read_table(source, columns)

        except pa.ArrowInvalid as e:
            err = f"Arrow file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e
Example #9
    def _read_col_from_path(self, path):
        # Load a single column, preferring a .npy file over a .feather file.
        np_path = path + '.npy'
        feather_path = path + '.feather'
        if os.path.exists(np_path):
            return np.load(np_path)
        elif os.path.exists(feather_path):
            df = pf.read_table(feather_path).to_pandas()
            return df[df.columns[0]]
        return None  # neither path exists
Example #10
def test_feather_v017_experimental_compression_backward_compatibility(datadir):
    # ARROW-11163 - ensure newer pyarrow versions can read the old feather
    # files from version 0.17.0 with experimental compression support (before
    # it was officially added to IPC format in 1.0.0)

    # file generated with:
    #     table = pa.table({'a': range(5)})
    #     from pyarrow import feather
    #     feather.write_feather(
    #         table, "v0.17.0.version=2-compression=lz4.feather",
    #         compression="lz4", version=2)
    expected = pa.table({'a': range(5)})
    result = read_table(datadir / "v0.17.0.version=2-compression=lz4.feather")
    assert result.equals(expected)
Example #11
    def _get_benchmark_function(self, source, case):
        file_type, compression, output_type = case
        path = source.create_if_not_exists(file_type, compression)

        if file_type == "parquet" and output_type == "table":
            f = lambda: parquet.read_table(path, memory_map=False)
        elif file_type == "parquet" and output_type == "dataframe":
            f = lambda: parquet.read_table(path, memory_map=False).to_pandas()
        elif file_type == "feather" and output_type == "table":
            f = lambda: feather.read_table(path, memory_map=False)
        elif file_type == "feather" and output_type == "dataframe":
            f = lambda: feather.read_feather(path, memory_map=False)

        return f
Example #12
def convert_apache_arrow_feather_to_apache_parquet(
        data_path: InputPath('ApacheArrowFeather'),
        output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts Apache Arrow Feather to Apache Parquet.

    [Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pyarrow import feather, parquet

    table = feather.read_table(data_path)
    parquet.write_table(table, output_data_path)
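
Outside a pipeline runner, the body of this component is a two-line round trip. A standalone equivalent (file names are placeholders):

from pyarrow import feather, parquet

# Read a Feather file into an Arrow table, then write it back out as Parquet.
table = feather.read_table('input.feather')
parquet.write_table(table, 'output.parquet')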
Example #13
    def test_feather_auto_chunked(self):
        from pyarrow.feather import read_table, write_feather

        x = np.arange(2048).reshape(1024, 2)
        s = TensorArray(x)
        df = pd.DataFrame({"i": list(range(len(s))), "tensor": s})

        table = pa.Table.from_pandas(df)

        # Write table to feather and read back as a DataFrame
        with tempfile.TemporaryDirectory() as dirpath:
            filename = os.path.join(dirpath, "tensor_array_chunked_test.feather")
            write_feather(table, filename, chunksize=512)
            table = read_table(filename)
            self.assertGreaterEqual(table.column("tensor").num_chunks, 2)
            df_read = pd.read_feather(filename)
            pd.testing.assert_frame_equal(df, df_read)
Example #14
def test_smry2arrow(testdata_folder: Path, tmp_path: Path) -> None:

    eclbase = (testdata_folder / "01_drogon_ahm" / "realization-0" / "iter-0" /
               "eclipse" / "model" / "DROGON-0").resolve()
    assert eclbase.with_suffix(".UNSMRY").exists()

    ert_config_file = _create_minimal_ert_config_file(
        tmp_path, f"SMRY2ARROW(<ECLBASE>={eclbase})")
    output_file = tmp_path / "output" / "share" / "results" / "tables" / "unsmry.arrow"
    subprocess.check_output(["ert", "test_run", ert_config_file],
                            cwd=tmp_path)  # nosec

    assert output_file.exists()

    table = feather.read_table(output_file)
    assert table.shape == (243, 921)

    sample_date = table["DATE"][0]
    assert sample_date.type == pa.timestamp("ms")

    schema = table.schema
    field = schema.field("FOPT")
    field_meta = json.loads(field.metadata[b"smry_meta"])
    assert field.type == pa.float32()
    assert field_meta["unit"] == "SM3"
    assert field_meta["is_total"] == True
    assert field_meta["is_rate"] == False
    assert field_meta["is_historical"] == False

    field = schema.field("FOPR")
    field_meta = json.loads(field.metadata[b"smry_meta"])
    assert field.type == pa.float32()
    assert field_meta["unit"] == "SM3/DAY"
    assert field_meta["is_total"] == False
    assert field_meta["is_rate"] == True
    assert field_meta["is_historical"] == False

    field = schema.field("FOPTH")
    field_meta = json.loads(field.metadata[b"smry_meta"])
    assert field.type == pa.float32()
    assert field_meta["unit"] == "SM3"
    assert field_meta["is_total"] == True
    assert field_meta["is_rate"] == False
    assert field_meta["is_historical"] == True
Example #15
    def from_feather(self, columns):
        """
        from_feather transforms feather data into Time_Series_Data or
        Time_Series_Collection.

        Parameters
        ----------
        columns : list of str
            column names to fetch

        Returns
        -------
        Time_Series_Data or Time_Series_Collection
        """
        table = pf.read_table(
            source=self.dirPaths,
            columns=columns
        )
        return from_arrow_table(table, self.timeSeriesCol, self.mainCategoryCol)
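
A hedged usage sketch, assuming `reader` is an instance of the class this method belongs to, already configured with a feather path and the time-series/category column names (the column names below are placeholders):

# Read only the listed columns from the feather source and convert them
# into the library's time-series structures.
ts_data = reader.from_feather(columns=['time', 'value', 'category'])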
Example #16
def test_missing_metadata(tmp_path):
    df = geopandas.read_file(
        geopandas.datasets.get_path("naturalearth_lowres"))
    path = tmp_path / "test.feather"

    # convert to DataFrame with wkb -> writing to feather will have only pandas metadata
    df = df.to_wkb()
    df.to_feather(path)

    with pytest.raises(ValueError, match="Missing geo metadata"):
        dask_geopandas.read_feather(path)

    # remove metadata completely
    from pyarrow import feather

    table = feather.read_table(path)
    feather.write_feather(table.replace_schema_metadata(), path)

    with pytest.raises(ValueError, match="Missing geo metadata"):
        dask_geopandas.read_feather(path)
Example #17
    def test_read_table(self):
        num_values = (100, 100)
        path = random_path()

        self.test_files.append(path)
        writer = FeatherWriter()
        writer.open(path)

        values = np.random.randint(0, 100, size=num_values)

        for i in range(100):
            writer.write_array('col_' + str(i), values[:, i])

        writer.close()

        data = pd.DataFrame(values,
                            columns=['col_' + str(i) for i in range(100)])
        table = pa.Table.from_pandas(data)

        result = read_table(path)

        assert_frame_equal(table.to_pandas(), result.to_pandas())
Example #19
def test_use_threads(version):
    # ARROW-14470
    num_values = (10, 10)
    path = random_path()

    TEST_FILES.append(path)

    values = np.random.randint(0, 10, size=num_values)
    columns = ['col_' + str(i) for i in range(10)]
    table = pa.Table.from_arrays(values, columns)

    write_feather(table, path, version=version)

    result = read_feather(path)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_feather with use_threads=False
    result = read_feather(path, use_threads=False)
    assert_frame_equal(table.to_pandas(), result)

    # Test read_table with use_threads=False
    result = read_table(path, use_threads=False)
    assert result.equals(table)
Example #20
def _from_geofeather(path, columns=None):
    """Deserialize a pandas.DataFrame stored in a feather file.

    If the corresponding .crs file is found, it is used to set the CRS of
    the GeoDataFrame.

    Parameters
    ----------
    path : str
        path to feather file to read
    columns : list-like (optional, default: None)
        Subset of columns to read from the file, must include 'geometry'.  If not provided,
        all columns are read.

    Returns
    -------
    tuple of (pandas.DataFrame, dict or str)
        DataFrame will contain a "geometry" or "wkb" column with WKB-encoded geometry data.
        crs will be a dict or str depending on what was serialized.
    """
    crs = None
    crsfilename = "{}.crs".format(path)
    if os.path.exists(crsfilename):
        with open(crsfilename) as crsfile:
            crs = json.load(crsfile)
        if "wkt" in crs:
            crs = crs["wkt"]
        elif "proj4" in crs:
            crs = crs["proj4"]
    else:
        warnings.warn(
            "{} coordinate reference system file is missing. No crs will be set for this GeoDataFrame."
            .format(crsfilename))

    # TODO: use geopandas feather I/O instead

    return read_table(path, columns=columns).to_pandas(), crs
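
The reader above expects an optional JSON sidecar named `<path>.crs` next to the feather file, holding either a "wkt" or a "proj4" key. A minimal sketch of producing one (the proj4 string is only an example):

import json

# Write the sidecar file that _from_geofeather looks for.
with open('data.feather.crs', 'w') as crsfile:
    json.dump({'proj4': '+proj=longlat +datum=WGS84 +no_defs'}, crsfile)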
Example #21
def convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename: str,
    output_ct_db_filename: str,
    compression: Optional[str] = "zstd",
    compression_level: int = 6,
    to_version: int = 2,
):
    """
    Convert cisTarget Feather database from Feather v1 to v2 format (with or without compression) and vice versa.

    :param input_ct_db_filename: input cisTarget database filename.
    :param output_ct_db_filename: output cisTarget database filename.
    :param compression: Compression method: "zstd" (default), "lz4" or "uncompressed".
    :param compression_level: Compression level for "zstd" or "lz4".
    :param to_version: Output Feather file format version: 1 (legacy) or 2 (default).
    :return:
    """

    if to_version != 2 and to_version != 1:
        raise ValueError(
            "Feather file version only supports 1 (legacy) or 2 (default).")

    if to_version == 1:
        # Compression is not supported in Feather v1 format.
        compression = "uncompressed"
        compression_level = None

    if compression not in {"zstd", "lz4", "uncompressed"}:
        raise ValueError(
            f'Unsupported compression value "{compression}". Choose "zstd" (default), "lz4" or "uncompressed".'
        )

    # Read input cisTarget database as a pyarrow Table.
    df_pa_table = pf.read_table(source=input_ct_db_filename)

    # Get all column names.
    all_column_names = df_pa_table.column_names

    try:
        # Check if we have an old database that still used a "features" column and rename it.
        features_idx = all_column_names.index("features")

        # Get column which contains motif or track names.
        motifs_or_track_names = df_pa_table.column(features_idx)

        if pc.sum(pc.starts_with(motifs_or_track_names, "jaspar")).as_py() > 0:
            # It is a motif vs genes/regions database if JASPAR motif names were found in the "features" column.
            all_column_names[features_idx] = "motifs"
        else:
            all_column_names[features_idx] = "tracks"

        df_pa_table.drop(["features"])
        # Rename features column in database to "motifs" or "tracks".
        df_pa_table = df_pa_table.rename_columns(all_column_names)
    except ValueError:
        # No old database (with "features" column).
        pass

    # Get the database index column name ("motifs", "tracks", "regions" or
    # "genes", depending on the database type).
    index_column_name = None
    for column_name in all_column_names:
        if column_name in {"motifs", "tracks", "regions", "genes"}:
            index_column_name = column_name
            break

    if index_column_name is None:
        raise ValueError(
            'No index column ("motifs", "tracks", "regions" or "genes") '
            "found in the database."
        )

    # Sort the non-index column names and add the index column as the last
    # column.
    column_names_sorted_and_index = sorted(
        column_name for column_name in all_column_names
        if column_name != index_column_name
    )
    column_names_sorted_and_index.append(index_column_name)

    # Create a new pyarrow Table with the columns in the new order.
    df_pa_table = df_pa_table.select(column_names_sorted_and_index)

    # Write the cisTarget database to a new Feather file with the requested compression/version settings.
    pf.write_feather(df=df_pa_table,
                     dest=output_ct_db_filename,
                     compression=compression,
                     compression_level=compression_level,
                     version=to_version)
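
A usage sketch for the converter (the database file names are placeholders):

# Convert a legacy Feather v1 cisTarget database to a zstd-compressed v2 file.
convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename='motifs_vs_genes.v1.feather',
    output_ct_db_filename='motifs_vs_genes.v2.feather',
    compression='zstd',
    compression_level=6,
    to_version=2,
)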
Example #22
def test_export_feather(tmpdir_factory):
    """Test export of DataFrame to feather"""
    Settings.tidy = False
    Settings.humanize = True
    Settings.si_units = False

    # Request data
    request = DwdObservationRequest(
        parameter=DwdObservationDataset.CLIMATE_SUMMARY,
        resolution=DwdObservationResolution.DAILY,
        start_date="2019",
        end_date="2020",
    ).filter_by_station_id(
        station_id=[1048],
    )

    df = request.values.all().df

    # Save to Feather file.
    filename = tmpdir_factory.mktemp("data").join("observations.feather")
    ExportMixin(df=df).to_target(f"file://{filename}")

    # Read back Feather file.
    table = feather.read_table(filename)

    # Validate dimensions.
    assert table.num_columns == 19
    assert table.num_rows == 366

    # Validate column names.
    assert table.column_names == [
        "station_id",
        "dataset",
        "date",
        "qn_3",
        "wind_gust_max",
        "wind_speed",
        "qn_4",
        "precipitation_height",
        "precipitation_form",
        "sunshine_duration",
        "snow_depth",
        "cloud_cover_total",
        "pressure_vapor",
        "pressure_air_site",
        "temperature_air_mean_200",
        "humidity",
        "temperature_air_max_200",
        "temperature_air_min_200",
        "temperature_air_min_005",
    ]

    # Validate content.
    data = table.to_pydict()

    assert data["date"][0] == datetime.datetime(2019, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    assert data["temperature_air_min_005"][0] == 1.5
    assert data["date"][-1] == datetime.datetime(2020, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    assert data["temperature_air_min_005"][-1] == -4.6

    os.unlink(filename)
Example #23
def test_feather_without_pandas(tempdir, version):
    # ARROW-8345
    table = pa.table([pa.array([1, 2, 3])], names=['f0'])
    write_feather(table, str(tempdir / "data.feather"), version=version)
    result = read_table(str(tempdir / "data.feather"))
    assert result.equals(table)
Example #24
    def _read_col_from_path(self, path):
        df = pf.read_table(path).to_pandas()
        return df[df.columns[0]]