Example #1
def _setup_influxdb_server(inst):
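    """
    Start a disposable influxd instance and attach HTTP API clients to the
    test object. `InfluxDbInstance` and `using_pypy` are assumed to be
    provided by the surrounding test module.
    """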
    inst.influxd_inst = InfluxDbInstance(
        inst.influxdb_template_conf,
        udp_enabled=getattr(inst, 'influxdb_udp_enabled', False),
    )

    inst.cli = InfluxDBClient('localhost',
                              inst.influxd_inst.http_port,
                              'root',
                              '',
                              database='db')
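    # pandas (required by DataFrameClient) is not supported on PyPy, so the
    # DataFrame client is only created on CPython.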
    if not using_pypy:
        inst.cliDF = DataFrameClient('localhost',
                                     inst.influxd_inst.http_port,
                                     'root',
                                     '',
                                     database='db')
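
For context, here is a minimal sketch of how such a helper is typically wired into a `unittest.TestCase`. The template path, the `close()` teardown hook, and the test body are illustrative assumptions, not taken from the original suite:

import unittest

class ServerTestCase(unittest.TestCase):
    # Hypothetical configuration template path.
    influxdb_template_conf = "influxdb.conf.template"

    def setUp(self):
        _setup_influxdb_server(self)

    def tearDown(self):
        # Assumed teardown hook on the server fixture.
        self.influxd_inst.close()

    def test_ping(self):
        # InfluxDBClient.ping() returns the server version string.
        self.assertIsNotNone(self.cli.ping())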
Example #2
    def export(self, target: str):
        """
        Emit Pandas DataFrame to target. A target
        is identified by a connection string.

        Examples:

        - duckdb://dwd.duckdb?table=weather
        - influxdb://localhost/?database=dwd&table=weather
        - crate://localhost/?database=dwd&table=weather

        Dispatch data to different data sinks. Currently, SQLite, DuckDB,
        InfluxDB and CrateDB are implemented. However, through the SQLAlchemy
        layer, it should actually work with any supported SQL database.

        - https://docs.sqlalchemy.org/en/13/dialects/

        :param target: Target connection string.
        :return: None
        """

        database, tablename = ConnectionString(target).get()

        if target.startswith("duckdb://"):
            """
            ====================
            DuckDB database sink
            ====================

            Install Python driver::

                pip install duckdb

            Acquire data::

                wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="duckdb:///dwd.duckdb?table=weather"

            Example queries::

                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.table("weather"))'  # noqa
                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.execute("SELECT * FROM weather").df())'  # noqa

            """
            log.info(f"Writing to DuckDB {database, tablename}")
            import duckdb

            connection = duckdb.connect(database=database, read_only=False)
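            # Expose the DataFrame to DuckDB as the virtual table "origin".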
            connection.register("origin", self.df)
            connection.execute(f"DROP TABLE IF EXISTS {tablename};")
            connection.execute(
                f"CREATE TABLE {tablename} AS SELECT * FROM origin;"  # noqa:S608
            )

            weather_table = connection.table(tablename)
            print(weather_table)
            print("Cardinalities:")
            print(weather_table.to_df().count())
            connection.close()
            log.info("Writing to DuckDB finished")

        elif target.startswith("influxdb://"):
            """
            ======================
            InfluxDB database sink
            ======================

            Install Python driver::

                pip install influxdb

            Run database::

                docker run --publish "8086:8086" influxdb/influxdb:1.8.2

            Acquire data::

                wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="influxdb://localhost/?database=dwd&table=weather"

            Example queries::

                http 'localhost:8086/query?db=dwd&q=SELECT * FROM weather;'
                http 'localhost:8086/query?db=dwd&q=SELECT COUNT(*) FROM weather;'
            """
            log.info(f"Writing to InfluxDB {database, tablename}")
            from influxdb.dataframe_client import DataFrameClient

            # Set up the connection.
            c = DataFrameClient(database=database)
            c.create_database(database)

            # Munge the data frame: use "date" as the timestamp index and
            # drop incomplete records.
            df = self.df.set_index(pd.DatetimeIndex(self.df["date"]))
            df = df.drop(["date"], axis=1)
            df = df.dropna()

            # Write to InfluxDB.
            c.write_points(
                dataframe=df,
                measurement=tablename,
                tag_columns=["station_id", "parameter", "element"],
            )
            log.info("Writing to InfluxDB finished")

        elif target.startswith("crate://"):
            """
            =====================
            CrateDB database sink
            =====================

            Install Python driver::

                pip install crate[sqlalchemy] crash

            Run database::

                docker run --publish "4200:4200" --env CRATE_HEAP_SIZE=512M crate/crate:4.2.4

            Acquire data::

                wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="crate://localhost/?database=dwd&table=weather"

            Example queries::

                crash -c 'select * from weather;'
                crash -c 'select count(*) from weather;'
                crash -c "select *, date_format('%Y-%m-%dT%H:%i:%s.%fZ', date) as datetime from weather order by datetime limit 10;"  # noqa

            """
            log.info("Writing to CrateDB")
            self.df.to_sql(
                name=tablename,
                con=target,
                if_exists="replace",
                index=False,
                method="multi",
                chunksize=5000,
            )
            log.info("Writing to CrateDB finished")

        else:
            """
            ========================
            SQLAlchemy database sink
            ========================

            Install Python driver::

                pip install sqlalchemy

            Examples::

                # Prepare
                alias fetch='wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent'

                # Acquire data.
                fetch --target="sqlite:///dwd.sqlite?table=weather"

                # Query data.
                sqlite3 dwd.sqlite "SELECT * FROM weather;"

            """
            log.info("Writing to SQL database")
            self.df.to_sql(
                name=tablename,
                con=target,
                if_exists="replace",
                index=False,
                method="multi",
                chunksize=5000,
            )
            log.info("Writing to SQL database finished")
Example #3
    def to_target(self, target: str):
        """
        Emit Pandas DataFrame to target. A target
        is identified by a connection string.

        Examples:

        - duckdb://dwd.duckdb?table=weather
        - influxdb://localhost/?database=dwd&table=weather
        - crate://localhost/?database=dwd&table=weather

        Dispatch data to different data sinks. Currently, SQLite, DuckDB,
        InfluxDB and CrateDB are implemented. However, through the SQLAlchemy
        layer, it should actually work with any supported SQL database.

        - https://docs.sqlalchemy.org/en/13/dialects/

        :param target: Target connection string.
        :return: None
        """

        log.info(f"Exporting records to {target}\n{self.df.count()}")

        t = ConnectionString(target)
        database = t.get_database()
        tablename = t.get_table()

        if target.startswith("file://"):
            filepath = t.get_path()

            if target.endswith(".xlsx"):
                log.info(f"Writing to spreadsheet file '{filepath}'")

                # Convert all datetime columns to ISO format.
                df = convert_datetimes(self.df)
                df.to_excel(filepath, index=False)

            elif target.endswith(".feather"):
                # https://arrow.apache.org/docs/python/feather.html
                log.info(f"Writing to Feather file '{filepath}'")
                import pyarrow.feather as feather

                feather.write_feather(self.df, filepath, compression="lz4")

            elif target.endswith(".parquet"):
                """
                # Acquire data and store to Parquet file.
                alias fetch="wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent"
                fetch --target="file://observations.parquet"

                # Check Parquet file.
                parquet-tools schema observations.parquet
                parquet-tools head observations.parquet

                # References
                - https://arrow.apache.org/docs/python/parquet.html
                """

                log.info(f"Writing to Parquet file '{filepath}'")
                import pyarrow as pa
                import pyarrow.parquet as pq

                table = pa.Table.from_pandas(self.df)
                pq.write_table(table, filepath)

            elif target.endswith(".zarr"):
                """
                # Acquire data and store to Zarr group.
                alias fetch="wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent"
                fetch --target="file://observations.zarr"

                # References
                - https://xarray.pydata.org/en/stable/generated/xarray.Dataset.from_dataframe.html
                - https://xarray.pydata.org/en/stable/generated/xarray.Dataset.to_zarr.html
                """

                log.info(f"Writing to Zarr group '{filepath}'")
                import xarray

                df = self.df

                # Problem: `ValueError: Cannot setitem on a Categorical with a new category, set the categories first`.
                # Solution: Let's convert all categorical columns back to their designated type representations.
                #           https://stackoverflow.com/questions/32011359/convert-categorical-data-in-pandas-dataframe/32011969#32011969
                if "quality" in df:
                    df.quality = df.quality.astype("Int64")
                categorical_columns = df.select_dtypes(["category"]).columns
                df[categorical_columns] = df[categorical_columns].astype("str")

                # Problem: `TypeError: float() argument must be a string or a number, not 'NAType'`.
                # Solution: Fill gaps in the data.
                df = df.fillna(-999)

                # Convert pandas DataFrame to xarray Dataset.
                dataset = xarray.Dataset.from_dataframe(df)
                log.info(f"Converted to xarray Dataset. Size={dataset.sizes}")

                # Export to Zarr format.
                # TODO: Add "group" parameter.
                #       Group path. (a.k.a. `path` in zarr terminology.)
                # TODO: Also use attributes: `store.set_attribute()`
                store = dataset.to_zarr(
                    filepath,
                    mode="w",
                    group=None,
                    encoding={"date": {
                        "dtype": "datetime64"
                    }},
                )

                # Reporting.
                dimensions = store.get_dimensions()
                variables = list(store.get_variables().keys())

                log.info(
                    f"Wrote Zarr file with dimensions={dimensions} and variables={variables}"
                )
                log.info(f"Zarr Dataset Group info:\n{store.ds.info}")

            else:
                raise KeyError("Unknown export file type")

            return

        if target.startswith("duckdb://"):
            """
            ====================
            DuckDB database sink
            ====================

            Install Python driver::

                pip install duckdb

            Acquire data::

                wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="duckdb:///dwd.duckdb?table=weather"

            Example queries::

                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.table("weather"))'  # noqa
                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.execute("SELECT * FROM weather").df())'  # noqa

            """
            log.info(
                f"Writing to DuckDB. database={database}, table={tablename}")
            import duckdb

            connection = duckdb.connect(database=database, read_only=False)
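            # Expose the DataFrame to DuckDB as the virtual table "origin".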
            connection.register("origin", self.df)
            connection.execute(f"DROP TABLE IF EXISTS {tablename};")
            connection.execute(
                f"CREATE TABLE {tablename} AS SELECT * FROM origin;"  # noqa:S608
            )

            weather_table = connection.table(tablename)
            print(weather_table)
            print("Cardinalities:")
            print(weather_table.to_df().count())
            connection.close()
            log.info("Writing to DuckDB finished")

        elif target.startswith("influxdb://"):
            """
            ======================
            InfluxDB database sink
            ======================

            Install Python driver::

                pip install influxdb

            Run database::

                docker run -it --rm --publish=8086:8086 influxdb:1.8

            Acquire data::

                wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="influxdb://localhost/?database=dwd&table=weather"

            Example queries::

                http 'localhost:8086/query?db=dwd&q=SELECT * FROM weather;'
                http 'localhost:8086/query?db=dwd&q=SELECT COUNT(*) FROM weather;'
            """
            log.info(
                f"Writing to InfluxDB. database={database}, table={tablename}")
            from influxdb.dataframe_client import DataFrameClient

            # 1. Munge the data frame.
            # Use the "date" column as the timestamp index.
            df = self.df.set_index(pd.DatetimeIndex(self.df["date"]))
            df = df.drop(["date"], axis=1)

            # Work around `ValueError: fill value must be in categories`.
            # The reason is that the InfluxDB Pandas adapter tries to apply
            # `tag_df.fillna('')  # replace NA with empty string`.
            # However, it is not possible to apply `.fillna` to categorical
            # columns. See:
            # - https://github.com/pandas-dev/pandas/issues/24079
            # - https://stackoverflow.com/questions/65316023/filling-np-nan-entries-of-float-column-gives-valueerror-fill-value-must-be-in-c/65316190
            # - https://stackoverflow.com/questions/53664948/pandas-fillna-throws-valueerror-fill-value-must-be-in-categories
            # - https://stackoverflow.com/questions/32718639/pandas-filling-nans-in-categorical-data/44633307
            #
            # So, let's convert all categorical columns back to their designated type representations.
            # https://stackoverflow.com/questions/32011359/convert-categorical-data-in-pandas-dataframe/32011969#32011969
            if "quality" in df:
                df.quality = df.quality.astype("Int64")
            categorical_columns = df.select_dtypes(["category"]).columns
            df[categorical_columns] = df[categorical_columns].astype("str")

            # When using the tidy format, don't export empty records.
            # Otherwise, the InfluxDB dataframe driver adapter will croak.
            if df.attrs.get("tidy"):
                df = df.dropna()

            # Compute designated tag fields from some candidates.
            tag_columns = []
            tag_candidates = [
                Columns.STATION_ID.value,
                Columns.QUALITY.value,
                Columns.QUALITY_PREFIX.value,
                Columns.DATASET.value,
                Columns.PARAMETER.value,
            ]
            for tag_candidate in tag_candidates:
                tag_candidate = tag_candidate.lower()
                for column in df.columns:
                    if column.startswith(tag_candidate):
                        tag_columns.append(column)
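            # For a typical frame this yields columns such as "station_id",
            # "dataset", "parameter", "quality" (illustrative).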

            # Set up the connection. CREATE DATABASE is idempotent in
            # InfluxDB 1.x, so this also works if the database already exists.
            c = DataFrameClient(database=database)
            c.create_database(database)

            # Need pandas>=1.2, otherwise InfluxDB's `field_df = dataframe[field_columns].replace([np.inf, -np.inf], np.nan)`
            # will erroneously cast `Int64` to `object`, so `int_columns = df.select_dtypes(include=['integer']).columns`
            # will fail.
            # https://github.com/pandas-dev/pandas/issues/32988

            # Write to InfluxDB.
            c.write_points(
                dataframe=df,
                measurement=tablename,
                tag_columns=tag_columns,
                batch_size=50000,
            )
            log.info("Writing to InfluxDB finished")

        elif target.startswith("crate://"):
            """
            =====================
            CrateDB database sink
            =====================

            Install Python driver::

                pip install crate[sqlalchemy] crash

            Run database::

                docker run -it --rm --publish=4200:4200 --env CRATE_HEAP_SIZE=2048M crate/crate:nightly

            Acquire data::

                wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="crate://crate@localhost/dwd?table=weather"

            Example queries::

                psql postgres://crate@localhost --command 'SELECT * FROM dwd.weather;'

                crash -c 'select * from dwd.weather;'
                crash -c 'select count(*) from dwd.weather;'
                crash -c "select *, date_format('%Y-%m-%dT%H:%i:%s.%fZ', date) as datetime from dwd.weather order by datetime limit 10;"  # noqa

            """
            log.info(f"Writing to CrateDB. target={target}, table={tablename}")

            # CrateDB's SQLAlchemy driver doesn't accept `database` or `table` query parameters.
            cratedb_url = t.url._replace(path="", query=None)
            cratedb_target = urlunparse(cratedb_url)

            # Convert timezone-aware datetime fields to naive ones.
            # FIXME: Omit this as soon as the CrateDB driver is capable of supporting timezone-qualified timestamps.
            self.df.date = self.df.date.dt.tz_localize(None)
            # self.df.date = self.df.date.dt.tz_convert(None)

            self.df.to_sql(
                name=tablename,
                con=cratedb_target,
                schema=database,
                if_exists="replace",
                index=False,
                # method="multi",
                chunksize=5000,
            )
            log.info("Writing to CrateDB finished")

        else:
            """
            ================================
            Generic SQLAlchemy database sink
            ================================

            Install Python driver::

                pip install sqlalchemy

            Examples::

                # Prepare
                alias fetch='wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent'

                # Acquire data.
                fetch --target="sqlite:///dwd.sqlite?table=weather"

                # Query data.
                sqlite3 dwd.sqlite "SELECT * FROM weather;"

            """

            # Honour SQLite's SQLITE_MAX_VARIABLE_NUMBER, which defaults to 999
            # for SQLite versions prior to 3.32.0 (2020-05-22),
            # see https://www.sqlite.org/limits.html#max_variable_number.
            chunksize = 5000
            if target.startswith("sqlite://"):
                import sqlite3

                if sqlite3.sqlite_version_info < (3, 32, 0):
                    chunksize = int(999 / len(self.df.columns))

            log.info("Writing to SQL database")
            self.df.to_sql(
                name=tablename,
                con=target,
                if_exists="replace",
                index=False,
                method="multi",
                chunksize=chunksize,
            )
            log.info("Writing to SQL database finished")