Ejemplo n.º 1
0
def test_get_history_with_single_match(test_session):
    """get_history returns a record for an observation that matches the
    test fixture, and the Station/History row counts stay at the
    fixture's expected values (6 and 8)."""
    obs = Row(
        time=datetime.now(),
        val=123,
        variable_name="relative_humidity",
        unit="percent",
        network_name="MoTIe",
        station_id="11091",
        lat=None,
        lon=None,
    )

    match = get_history(
        test_session, obs.network_name, obs.station_id, obs.lat, obs.lon
    )
    assert match is not None

    assert test_session.query(Station).count() == 6
    assert test_session.query(History).count() == 8
Ejemplo n.º 2
0
def test_get_history_with_no_matches(test_session):
    """get_history still returns a value for an observation that matches
    nothing in the test session; afterwards the Station/History counts
    are 7 and 9."""
    # This observation will not match any in the test session.
    obs = Row(
        time=datetime.now(),
        val=123,
        variable_name="relative_humidity",
        unit="percent",
        network_name="FLNRO-WMB",
        station_id="666",
        lat=None,
        lon=None,
    )

    match = get_history(
        test_session, obs.network_name, obs.station_id, obs.lat, obs.lon
    )
    assert match is not None

    assert test_session.query(Station).count() == 7
    assert test_session.query(History).count() == 9
Ejemplo n.º 3
0
def test_diagnostic(crmp_session, diag, count):
    """infer() creates the parametrized number of test_var* Variable
    rows for a small batch of duplicate/distinct observations."""
    now = datetime.now()
    raw = [
        (now, 0, "test_var_a", "degrees", "MoTIe", "noname", 40, -120),
        (now, 0, "test_var_a", "degrees", "MoTIe", "noname", 40, -120),
        (now, 0, "test_var_b", "mm", "MoTIe", "noname", 40, -120),
    ]
    crmp_session.add(Network(name="MoTIe"))
    infer(crmp_session, (Row(*fields) for fields in raw), diag)
    matches = crmp_session.query(Variable).filter(
        Variable.name.like("test_var%")
    )
    assert matches.count() == count
Ejemplo n.º 4
0
def test_get_history_with_multiple_matches_and_location(test_session):
    """When lat/lon are supplied, get_history resolves the observation
    to the specific history record with id 20 in the fixture."""
    obs = Row(
        time=datetime.now(),
        val=123,
        variable_name="relative_humidity",
        unit="percent",
        network_name="EC_raw",
        station_id="1047172",
        lat=49.45,
        lon=-123.7,
    )

    match = get_history(
        test_session, obs.network_name, obs.station_id, obs.lat, obs.lon
    )
    assert match.id == 20
Ejemplo n.º 5
0
def normalize(stream):
    """Yield a Row for every non-missing CRD observation in *stream*.

    *stream* holds a JSON document whose HEADER/_units mapping names the
    variables (keys like "<var>Unit") and whose DATA list holds one
    record per timestamp.  Times are localized to Canada/Pacific and
    converted to UTC; missing values (null or -9999) are skipped.
    """
    log.info("Starting CRD data normalization")

    pacific = pytz.timezone("Canada/Pacific")
    payload = json.load(stream)

    units = payload["HEADER"]["_units"]
    var_names = [key.replace("Unit", "") for key in units]
    log.debug("Found variables %s", var_names)

    for record in payload["DATA"]:
        # Timezone information isn't provided by CRD, but the
        # observations appear to be in local time. The max time
        # value found in a request is the most recent hour local
        # time. Hopefully assuming this will suffice.
        naive = datetime.strptime(record["DateTimeString"], "%Y%m%d%H%M%S")
        timestamp = pacific.localize(naive).astimezone(pytz.utc)

        for name in var_names:
            value = record[name]
            # CRD uses -9999 and null for missing values. Skip these.
            # See page 2 here: https://tinyurl.com/quczs93
            if value is None or value == -9999:
                continue

            yield Row(
                time=timestamp,
                val=value,
                variable_name=name,
                unit=units[f"{name}Unit"],
                network_name="CRD",
                station_id=record["StationName"],
                lat=None,
                lon=None,
            )
Ejemplo n.º 6
0
    variable = get_variable(test_session, network_name, variable_name)
    check_val = unit_check(val, unit, variable.unit)
    assert check_val == expected


@pytest.mark.parametrize(
    ("obs_tuple", "expected_hid", "expected_time", "expeceted_vid",
     "expected_datum"),
    [
        # use match_station_with_active to match
        (
            Row(
                time=datetime(2012, 9, 26, 18),
                val=123,
                variable_name="precipitation",
                unit="mm",
                network_name="EC_raw",
                station_id="1047172",
                lat=None,
                lon=None,
            ),
            21,
            datetime(2012, 9, 26, 18),
            2,
            123,
        ),
        # use unit_db_check to convert units
        (
            Row(
                time=datetime(2012, 9, 26, 18),
                val=10,
                variable_name="precipitation",
Ejemplo n.º 7
0
def normalize(file_stream):
    """Yield a Row for every parseable observation in a WMB CSV byte stream.

    The first line of *file_stream* is a header naming the columns; in
    each data row the first two columns are the station id and a local
    timestamp whose hour field runs 1-24.  Times are localized to
    Canada/Pacific and converted to UTC.  Rows with unparseable dates
    and values that are empty or non-numeric are logged and skipped.
    """
    log.info("Starting WMB data normalization")

    def clean_row(row):
        # Strip surrounding whitespace and quoting, then split on commas.
        return row.strip().replace('"', "").split(",")

    # Set variable names using the first row in the file stream
    # (an empty stream yields no variable names and hence no Rows).
    header = next(iter(file_stream), None)
    var_names = clean_row(header.decode("utf-8")) if header is not None else []

    # Loop-invariant: build the timezone once, not once per data row.
    tz = pytz.timezone("Canada/Pacific")

    for row in file_stream:
        # Pair each variable name with its value in this row.
        data = list(zip(var_names, clean_row(row.decode("utf-8"))))

        # extract station_id and weather_date from list
        _, station_id = data.pop(0)
        _, weather_date = data.pop(0)

        # The dates provided are in 1-24 hour format *roll*;
        # shift to 0-23 so strptime's %H accepts it.
        hour = int(weather_date[-2:]) - 1
        weather_date = weather_date[:-2] + str(hour)
        try:
            # Timezone information isn't provided by WMB, but the
            # observations appear to be in local time. The max time
            # value found in a request is the most recent hour local
            # time. Hopefully assuming this will suffice.
            date = tz.localize(
                datetime.strptime(weather_date, "%Y%m%d%H")
            ).astimezone(pytz.utc)
        except ValueError:
            log.error("Unable to convert date", extra={"date": weather_date})
            continue

        for var_name, value in data:
            # skip if value string is empty
            if not value:
                continue

            try:
                value = float(value)
            except ValueError:
                log.error("Unable to convert val to float", extra={"value": value})
                continue

            yield Row(
                time=date,
                val=value,
                variable_name=var_name,
                unit=None,
                network_name="FLNRO-WMB",
                station_id=station_id,
                lat=None,
                lon=None,
            )
Ejemplo n.º 8
0
def normalize(file_stream):
    """Yield a Row for every usable observation in a WAMR CSV byte stream.

    Handles BC ENV's inconsistent column naming (UNIT/UNITS,
    EMS_ID/STATION_NAME, REPORTED_VALUE/RAW_VALUE), localizes times
    (labelled DATE_PST) to Canada/Pacific and converts to UTC,
    normalizes a few unit spellings, and re-associates a known set of
    Metro Vancouver stations with the "MVan" network.  Rows with empty
    values, non-numeric values, or unparseable dates are skipped.
    """
    log.info("Starting WAMR data normalization")

    # There is a set of Metro Vancouver's stations that are being
    # delivered to us by WAMR, but it is desired that they are re-
    # associated with the correct network. Attempting this by altering the
    # normalization to the correct station_id and the correct network
    # name. Issue here is that the metrovan variables need to match the ENV-AQN
    # variables. Will work on that in the database.
    #
    # Load the substitution table ONCE up front, instead of re-opening
    # and re-parsing the YAML resource for every single CSV row.
    with resource_stream("crmprtd", "wamr/station_substitutions.yaml") as f:
        station_substitutions = yaml.safe_load(f)

    # Loop invariants, hoisted out of the per-row loop.
    tz = pytz.timezone("Canada/Pacific")
    keys_of_interest = (
        "DATE_PST",
        "EMS_ID",
        "STATION_NAME",
        "UNIT",
        "UNITS",
        "PARAMETER",
        "REPORTED_VALUE",
        "RAW_VALUE",
        "LONGITUDE",
        "LATITUDE",
    )
    unit_substitutions = [("% RH", "%"), ("\u00b0C", "celsius"), ("mb", "millibar")]

    string_stream = io.StringIO(file_stream.read().decode("utf-8"))
    reader = csv.DictReader(string_stream)
    for row in reader:
        (
            time,
            ems_id,
            station_name,
            unit,
            units,
            variable_name,
            rep_val,
            raw_val,
            lon,
            lat,
        ) = (row.get(k) for k in keys_of_interest)

        # Circa 2020, BC ENV is presenting inconsistent names for
        # several of their columns (UNIT/UNITS, EMS_ID/STATION_NAME,
        # REPORTED_VALUE/RAW_VALUE/ROUNDED_VALUE. Ensure that we have
        # at least one of these sets.
        unit = get_one_of((unit, units))
        reported_station_id = get_one_of((ems_id, station_name))
        try:
            val = get_one_of((rep_val, raw_val))
        except ValueError:
            # skip over empty values
            continue

        try:
            value = float(val)
        except ValueError:
            log.error("Unable to convert val to float", extra={"value": val})
            continue

        try:
            # Timezone information is not available from the text
            # string provided. However, the date field in WAMR's feed
            # is always titled "DATE_PST" (even during times of
            # DST). There's not really enough information available
            # from the network, so we'll have to assume that this
            # covers it.
            dt = tz.localize(parse(time)).astimezone(pytz.utc)
        except ValueError:
            log.error("Unable to convert date string to datetime", extra={"time": time})
            continue

        for src, dest in unit_substitutions:
            if unit == src:
                unit = re.sub(src, dest, unit)

        if reported_station_id in station_substitutions:
            station_id = station_substitutions[reported_station_id]
            network_name = "MVan"
        else:
            station_id = reported_station_id
            network_name = "ENV-AQN"

        yield Row(
            time=dt,
            val=value,
            variable_name=variable_name,
            unit=unit,
            network_name=network_name,
            station_id=station_id,
            lat=lat,
            lon=lon,
        )
Ejemplo n.º 9
0
def normalize(file_stream):
    """Parse a MoTI XML observation feed and yield one Row per numeric
    observation value, with times normalized to UTC.  Series without a
    station id, observations without a valid time, and values that are
    missing or non-numeric are logged and skipped."""
    log.info("Starting MOTI data normalization")
    tree = transform(xmlparse(file_stream))

    for series in tree.xpath("//observation-series"):
        if not len(series):
            log.warning("Empty observation series: xpath search "
                        "'//observation-series' return no results")
            continue

        try:
            stn_id = series.xpath(
                "./origin/id[@type='client']")[0].text.strip()
        except IndexError as e:
            log.error(
                "Could not detect the station id: xpath search "
                "'//observation-series/origin/id[@type='client']' "
                "return no results",
                extra={"exception": e},
            )
            continue

        for member in series.xpath("./observation", namespaces=ns):
            # Get the observation time and convert it to a datetime.
            time = member.get("valid-time")
            if not time:
                log.warning("Could not find a valid-time attribute for this "
                            "observation")
                continue

            try:
                # MoTI gives us an ISO formatted time string with
                # timezone info attached so it should be sufficient to
                # simply parse it and display it as UTC.
                date = dateparse(time).astimezone(pytz.utc)
            except ValueError:
                log.warning("Unable to convert value to datetime",
                            extra={"time": time})
                continue

            for obs in member.iterchildren():
                variable_name = obs.get("type")
                if variable_name is None:
                    continue

                value_elements = obs.xpath("./value")
                if not value_elements:
                    log.warning(
                        "Could not find the actual value for "
                        "observation. xpath search './value' "
                        "returned no results",
                        extra={"variable_name": variable_name},
                    )
                    continue
                value_element = value_elements[0]

                try:
                    value = float(value_element.text)
                except ValueError:
                    log.error(
                        "Could not convert value to a number. "
                        "Skipping this observation.",
                        extra={"value": value_element},
                    )
                    continue

                yield Row(
                    time=date,
                    val=value,
                    variable_name=variable_name,
                    unit=value_element.get("units"),
                    network_name="MoTIe",
                    station_id=stn_id,
                    lat=None,
                    lon=None,
                )
Ejemplo n.º 10
0
def normalize_xml(
    file_stream,
    network_name,
    station_id_attr="climate_station_number",
    station_id_xform=identity,
):
    """Parse an OM/XML stream and yield one Row per observed variable
    per om:member.

    Parameters:
        file_stream: file-like object containing the XML document.
        network_name: network name recorded on each yielded Row.
        station_id_attr: name of the identification element that holds
            the station id (defaults to "climate_station_number").
        station_id_xform: callable applied to the raw station id string
            before it is recorded (defaults to the identity function).

    Yields:
        Row tuples with UTC times.  Members or values that cannot be
        parsed (non-numeric value, missing station info, bad date) are
        logged and skipped.
    """
    et = parse_xml(file_stream)

    members = et.xpath("//om:member", namespaces=ns)
    log.info("Starting %s data normalization", network_name)

    for member in members:
        om = OmMember(member)
        vars = om.observed_vars()

        for var in vars:
            try:
                # Locate the <element name="<var>"> node holding this
                # variable's value inside the member's om:result.
                ele = om.member.xpath(
                    "./om:Observation/om:result//"
                    "{}[@name='{}']".format(no_ns_element("element"), var),
                    namespaces=ns,
                )[0]
                val = ele.get("value")
                # Ignore missing values. We don't record them.
                if val == "MSNG":
                    log.debug("Ignoring missing obs with value 'MSNG'")
                    continue
                val = float(val)
            # This shouldn't ever be empty based on our xpath for selecting
            # elements, however it could be non-numeric and
            # still be valid XML
            except ValueError as e:
                log.error("Unable to convert value",
                          extra={"val": (ele.get("value"))})
                continue

            try:
                log.debug("Finding Station attributes")
                station_id = member.xpath(
                    ".//{}/{}[@name='{}']".format(
                        no_ns_element("identification-elements"),
                        no_ns_element("element"),
                        station_id_attr,
                    ),
                    namespaces=ns,
                )[0].get("value")
                station_id = station_id_xform(station_id)

                # gml:pos text is split into two floats, assigned as
                # lat then lon.
                lat, lon = map(
                    float,
                    member.xpath(".//gml:pos", namespaces=ns)[0].text.split())
                obs_time = member.xpath(
                    "./om:Observation/om:samplingTime//gml:timePosition",
                    namespaces=ns)[0].text
                log.debug(
                    "Found station info",
                    extra={
                        "station_id": station_id,
                        "lon": lon,
                        "lat": lat,
                        "time": obs_time,
                    },
                )
            # An IndexError here means that the member has no station_name or
            # climate_station_number (or identification-elements), lat/lon,
            # or obs_time in which case we don't need to process this item
            except IndexError:
                log.warning("This member does not appear to be a station")
                continue

            try:
                # The timePosition string is parsed and normalized to UTC;
                # presumably it carries its own timezone info — TODO confirm.
                date = dateparse(obs_time).astimezone(pytz.utc)
            except ValueError as e:
                log.error("Unable to parse date", extra={"exception": e})
                continue

            yield Row(
                time=date,
                val=val,
                variable_name=var,
                unit=om.member_unit(var),
                network_name=network_name,
                station_id=station_id,
                lat=lat,
                lon=lon,
            )