def get_stations(cls, **retrieval_kwargs):
    """Retrieve a list of measuring stations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    # Retrieve and reshape data
    stations = retrieve(stations_cache_file, API_ENDPOINTS["stations"],
                        "station metadata", **retrieval_kwargs)
    stations = (stations
                .drop(columns=["geometry.type", "type"])
                .rename(columns={"properties.id": "id",
                                 "properties.label": "label"})
                .set_index("id"))

    # Split coordinates into columns. GeoJSON lists coordinates in
    # (longitude, latitude) order, so the columns are swapped here.
    coords = pd.DataFrame([row for row in stations["geometry.coordinates"]],
                          index=stations.index)
    stations[["lat", "lon"]] = coords[[1, 0]]
    stations.drop(columns=["geometry.coordinates"], inplace=True)

    cls.stations = stations
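# Usage sketch (hypothetical names; assumes this classmethod is exposed on
# an IRCELINE ``Metadata`` class and the module-level cache paths are
# configured):
#
#     >>> Metadata.get_stations()
#     >>> Metadata.stations[["label", "lat", "lon"]].head()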
def __init__(cls, **retrieval_kwargs):
    """Retrieve sensor information from the InfluencAir project.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Raises:
        KeyError if sheet structure does not match listed columns
    """
    sensor_info = retrieve(cache_file=sensor_info_cache_file,
                           url=SENSOR_SHEET_DOWNLOAD_URL,
                           label="InfluencAir sensor information",
                           read_func=pd.read_csv,
                           read_func_kwargs={"header": 1, "dtype": "object"},
                           call_rate_limiter=google_call_rate_limiter,
                           **retrieval_kwargs)
    try:
        sensor_info = (sensor_info[["Chip ID", "PM Sensor ID",
                                    "Hum/Temp Sensor ID", "Label", "Address",
                                    "Floor", "Side (Street/Garden)"]]
                       .rename(columns={"Side (Street/Garden)": "Side"}))
    except KeyError as error:
        raise KeyError("Could not get columns. Check if the structure or "
                       "labels of the InfluencAir sensor Google Sheet "
                       "have changed.") from error
    cls.sensors = sensor_info
    cls.initialized = True
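# Usage sketch (hypothetical; this initializer appears to populate
# class-level state on an InfluencAir ``Metadata`` class rather than an
# instance):
#
#     >>> Metadata()                 # populates Metadata.sensors
#     >>> Metadata.sensors[["Chip ID", "Label"]].head()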
def get_phenomena(cls, **retrieval_kwargs):
    """Retrieve a list of measured phenomena.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    phenomena = retrieve(phenomena_cache_file, API_ENDPOINTS["phenomena"],
                         "phenomenon metadata", **retrieval_kwargs)
    phenomena["id"] = phenomena["id"].astype("int")
    phenomena = phenomena.set_index("id").sort_index()
    cls.phenomena = phenomena
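# Usage sketch (hypothetical names, as above):
#
#     >>> Metadata.get_phenomena()
#     >>> Metadata.phenomena.head()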
def get_metadata(self, **retrieval_kwargs):
    """Get sensor metadata and current measurements from cache or
    luftdaten.info API.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Warns:
        UserWarning if sensor does not appear to be online
    """

    # Get and cache metadata and measurements of past five minutes
    filename = os.path.basename(self.metadata_url.rstrip("/")) + ".json"
    filepath = os.path.join(cache_dir, filename)
    parsed = retrieve(cache_file=filepath,
                      url=self.metadata_url,
                      label=("sensor {} metadata from luftdaten.info"
                             .format(self.sensor_id)),
                      call_rate_limiter=call_rate_limiter,
                      **retrieval_kwargs)

    try:
        metadata = (parsed
                    .drop(columns=["sensordatavalues", "timestamp"])
                    .iloc[0])
    except (ValueError, AttributeError, KeyError):
        warnings.warn("Sensor metadata could not be retrieved")
    else:
        metadata.name = "metadata"
        self.metadata = metadata

        # Extract metadata into corresponding properties
        self.sensor_type = metadata["sensor.sensor_type.name"]
        self.lat = float(metadata["location.latitude"])
        self.lon = float(metadata["location.longitude"])
        self.label = "at " + utils.label_coordinates(self.lat, self.lon)

        # Extract most current measurements
        current = parsed["sensordatavalues"].iloc[-1]
        current = (json_normalize(current)
                   .replace({"P1": "pm10", "P2": "pm2.5"})
                   .set_index("value_type")["value"])

        # Replace out-of-range placeholder values with NaN
        current = pd.to_numeric(current).replace([999.9, 1999.9],
                                                 float("nan"))
        self.current_measurements = dict(current)
        self.phenomena = list(current.index)
        self.units = {phenomenon: UNITS[phenomenon]
                      for phenomenon in UNITS
                      if phenomenon in self.phenomena}
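# Usage sketch (hypothetical sensor id; assumes a luftdaten.info ``Sensor``
# class exposing this method):
#
#     >>> sensor = Sensor("12345")
#     >>> sensor.get_metadata()
#     >>> sensor.sensor_type, sensor.lat, sensor.lon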
def get_time_series(cls, **retrieval_kwargs):
    """Retrieve information on available time series: a collection of
    station & phenomenon combinations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    def get_phenomenon_name(label):
        """Extract phenomenon name from time series label.

        Labels have the form "<phenomenon> <series id> - <station>":
        the station part is split off first, then the trailing series
        id is dropped.
        """
        phenomenon_name_series_id = label.split(sep=" - ", maxsplit=1)[0]
        phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
        return phenomenon_name

    # Retrieve and reshape data
    time_series = retrieve(cache_file=time_series_cache_file,
                           url=API_ENDPOINTS["time series"],
                           label="time series metadata",
                           call_rate_limiter=call_rate_limiter,
                           **retrieval_kwargs)
    time_series["id"] = time_series["id"].astype("int")
    time_series = (time_series
                   .set_index("id")
                   .drop(columns=["station.geometry.type", "station.type"])
                   .rename(columns={"station.properties.id": "station_id",
                                    "station.properties.label":
                                        "station_label",
                                    "uom": "unit"}))

    # Extract phenomenon names from labels
    labels = time_series["label"]
    time_series["phenomenon"] = labels.apply(get_phenomenon_name)

    # Split coordinates into columns. GeoJSON lists coordinates in
    # (longitude, latitude) order, so the columns are swapped here.
    coords = pd.DataFrame(
        [row for row in time_series["station.geometry.coordinates"]],
        index=time_series.index)
    time_series[["station_lat", "station_lon"]] = coords[[1, 0]]

    # Sort and drop columns
    time_series = time_series[["label", "phenomenon", "unit", "station_id",
                               "station_label", "station_lat",
                               "station_lon"]]

    # Clean unit descriptors
    time_series["unit"] = (time_series["unit"]
                           .str.replace("m3", "m³")
                           .str.replace("ug", "µg"))
    time_series.loc[time_series["phenomenon"] == "temperature",
                    "unit"] = "°C"

    cls.time_series = time_series
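# Usage sketch (hypothetical names, as above):
#
#     >>> Metadata.get_time_series()
#     >>> is_temp = Metadata.time_series["phenomenon"] == "temperature"
#     >>> Metadata.time_series[is_temp].head()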
def get_latest_measurement(self, **retrieval_kwargs):
    """Retrieve the most recent measurement of the time series.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    sensor_id = self.sensor_id

    # Query today's data; derive both date strings from UTC so they are
    # consistent with each other
    today_date = pd.to_datetime("today", utc=True).normalize()
    today = today_date.strftime("%Y-%m-%d")
    tomorrow = (today_date + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

    # Download the data
    url = API_ENDPOINTS["data pattern"].format(time_series_id=sensor_id,
                                               start=today, end=tomorrow)
    filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                .format(time_series_id=sensor_id, start_date=today,
                        end_date=tomorrow))
    filepath = os.path.join(cache_dir, filename)

    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])

    # Drop missing-value placeholders and keep the last entry
    data = data[data["value"] != "NaN"]
    last_measurement = data["value"].iloc[-1]
    return last_measurement
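# Usage sketch (hypothetical time series id; assumes an IRCELINE ``Sensor``
# class where sensor_id identifies a time series):
#
#     >>> sensor = Sensor("6522")
#     >>> sensor.get_latest_measurement()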
def get_measurements(self, start_date, end_date, **retrieval_kwargs):
    """Retrieve time series data.

    Args:
        start_date: date string in ISO 8601 (YYYY-MM-DD) format.
            Interpreted as UTC.
        end_date: date string like start_date. If the current date or a
            future date is entered, end will be truncated so that only
            complete days are downloaded.
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Raises:
        ValueError if start_date is later than end_date
    """

    # Make start and end timezone aware and truncate time values
    query_start_date = pd.to_datetime(start_date, format="%Y-%m-%d",
                                      utc=True).normalize()
    query_end_date = (pd.to_datetime(end_date, format="%Y-%m-%d",
                                     utc=True).normalize()
                      + pd.Timedelta(days=1))  # To include end_date data

    # Check validity of input and truncate end date if needed
    today = pd.to_datetime("today", utc=True)
    if query_end_date > today:
        warnings.warn("Resetting end_date to yesterday")
        yesterday = today - pd.Timedelta(days=1)
        end_date = yesterday.strftime("%Y-%m-%d")
        query_end_date = today.normalize()  # 00:00 today, to include all
                                            # of yesterday's data
    if query_start_date > query_end_date:
        raise ValueError("end_date must be greater than or equal to "
                         "start_date")

    # IRCELINE API takes local times. Convert start and end accordingly.
    query_start_local = query_start_date.tz_convert("Europe/Brussels")
    query_start_local_str = query_start_local.strftime("%Y-%m-%dT%H")
    query_end_local = query_end_date.tz_convert("Europe/Brussels")
    query_end_local -= pd.Timedelta(1, "s")  # Make end of interval exclusive
    query_end_local_str = query_end_local.strftime("%Y-%m-%dT%H:%M:%S")

    url = API_ENDPOINTS["data pattern"].format(
        time_series_id=self.sensor_id,
        start=query_start_local_str,
        end=query_end_local_str)

    # TODO: Split response into days and cache as daily files; check cache
    #       day by day. Find longest missing intervals to make as few
    #       requests as possible.
    filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                .format(time_series_id=self.sensor_id,
                        start_date=start_date, end_date=end_date))
    filepath = os.path.join(cache_dir, filename)

    # TODO: Check day by day if data are cached

    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])
    if len(data) == 0:
        return
    data["value"] = data["value"].astype("float")
    data = data.rename(columns={"value": self.metadata["phenomenon"]})

    # Convert Unix timestamps to datetimes and then to periods for index
    data.index = (pd.to_datetime(data["timestamp"], unit="ms", utc=True)
                  .dt.to_period(freq="h"))
    data.index.name = "Period"
    data = data.drop(columns=["timestamp"])

    self.measurements = data
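# Usage sketch (hypothetical id; assumes self.metadata["phenomenon"] has
# been populated by a prior metadata call):
#
#     >>> sensor = Sensor("6522")
#     >>> sensor.get_measurements(start_date="2019-01-01",
#     ...                         end_date="2019-01-07")
#     >>> sensor.measurements.head()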
def get_measurements(self, start_date, end_date, **retrieval_kwargs):
    """Get measurement data of the sensor in a given period.

    Data are read from cache if available, or downloaded from
    luftdaten.info and saved to cache as retrieved, and then cleaned for
    self.measurements. If the instance already has data associated with
    it, calling this method replaces them.

    Args:
        start_date: first date of data to retrieve, in ISO 8601
            (YYYY-MM-DD) format
        end_date: last date of data to retrieve, in ISO 8601
            (YYYY-MM-DD) format
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    sid = self.sensor_id
    if self.sensor_type is None:
        self.sensor_type = input("Type of sensor {} has not been set yet. "
                                 "Enter sensor type: ".format(sid))
    stype = self.sensor_type.lower()

    # Get and process the data file for each date in the requested range
    daily_data = []
    for date in pd.date_range(start_date, end_date):
        date_iso = date.strftime("%Y-%m-%d")
        filename = ARCHIVE_FILENAME_PATTERN.format(date=date_iso,
                                                   sensor_type=stype,
                                                   sensor_id=sid)
        filepath = os.path.join(cache_dir, filename)
        url = ARCHIVE_URL_PATTERN.format(date=date_iso, filename=filename)
        data = retrieve(cache_file=filepath,
                        url=url,
                        label=("luftdaten.info data for sensor {} on {}"
                               .format(sid, date_iso)),
                        read_func=pd.read_csv,
                        call_rate_limiter=call_rate_limiter,
                        read_func_kwargs={"sep": ";"},
                        **retrieval_kwargs)
        if data is None:
            continue

        # Parse timestamps and make them timezone aware
        timestamps = pd.to_datetime(data["timestamp"], utc=True)

        # Reformat data according to sensor type
        data.set_index(timestamps, inplace=True)
        if self.sensor_type in ("SDS011", "HPM"):
            data = (data[["P1", "P2"]]
                    .rename(columns={"P1": "pm10", "P2": "pm2.5"}))
        elif self.sensor_type == "DHT22":
            data = data[["temperature", "humidity"]]
        else:
            raise NotImplementedError("No data parsing method implemented "
                                      "for sensor type {}"
                                      .format(self.sensor_type))
        daily_data.append(data)

    # If daily data were retrieved, concatenate them to a single dataframe
    if daily_data:
        self.measurements = pd.concat(daily_data)
    else:
        self.measurements = None
        print("No data for sensor", sid)
        return

    # Remove duplicates
    duplicates = self.measurements.index.duplicated(keep="last")
    self.measurements = self.measurements[~duplicates]
    self.measurements.sort_index(inplace=True)

    self.clean_measurements()
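# Usage sketch (hypothetical sensor; assumes get_metadata has already set
# the sensor type, so no interactive prompt is needed):
#
#     >>> sensor = Sensor("12345")
#     >>> sensor.get_metadata()
#     >>> sensor.get_measurements(start_date="2019-01-01",
#     ...                         end_date="2019-01-07")
#     >>> sensor.measurements["pm10"].describe()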