Example #1
0
def test_invalid_reference_raises_error():
    """
    Passing something that is neither a BaseLocation nor a coordinate pair as
    the reference location should raise a ValueError.
    """
    expected_message = "Argument 'reference_location' should be an instance of BaseLocation class or a tuple of two floats. Got: str"
    with pytest.raises(ValueError, match=expected_message):
        locations = SubscriberLocations(
            "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lon-lat")
        )
        DistanceSeries(
            subscriber_locations=locations, reference_location="NOT_A_LOCATION"
        )
Example #2
0
def test_active_at_reference_location(get_dataframe):
    """
    Check the expected truth values for active-at-reference-location output.
    """
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    activity = ActiveAtReferenceLocation(
        subscriber_locations=unique_locs,
        reference_locations=daily_location("2016-01-03"),
    )
    result = get_dataframe(activity).set_index("subscriber")
    assert not result.loc["038OVABN11Ak4W5P"][0]
    assert result.loc["09NrjaNNvDanD8pk"][0]
def test_call_days_returns_expected_counts_per_subscriber(get_dataframe):
    """
    Test that the returned data is correct for a given subscriber.

    Each case gives (subscriber, start, end, expected total call days).
    """
    test_values = (
        ("Z89mWDgZrr3qpnlB", "2016-01-01", "2016-01-03", 9),
        ("Z89mWDgZrr3qpnlB", "2016-01-01", "2016-01-08", 30),
        ("038OVABN11Ak4W5P", "2016-01-01", "2016-01-03", 6),
        ("038OVABN11Ak4W5P", "2016-01-01", "2016-01-08", 32),
    )
    for subscriber, start, end, calls in test_values:
        cd = CallDays(
            SubscriberLocations(
                start, end, spatial_unit=make_spatial_unit("versioned-site")
            )
        )
        # f-string instead of dated str.format; the resulting query text is unchanged.
        df = get_dataframe(cd).query(f'subscriber == "{subscriber}"')
        assert df.calldays.sum() == calls
def test_unmoving_at_reference_location_column_names(get_column_names_from_run):
    """UnmovingAtReferenceLocation should expose subscriber and value columns."""
    query = UnmovingAtReferenceLocation(
        locations=UniqueLocations(
            SubscriberLocations(
                "2016-01-01",
                "2016-01-01 10:00",
                spatial_unit=make_spatial_unit("admin", level=3),
            )
        ),
        reference_locations=LastLocation("2016-01-01", "2016-01-02"),
    )
    assert get_column_names_from_run(query) == ["subscriber", "value"]
def test_spatial_unit_mismatch_error():
    """A spatial-unit mismatch between the two inputs should raise ValueError."""
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-01 10:00",
            spatial_unit=make_spatial_unit("admin", level=2),
        )
    )
    last_loc = LastLocation(
        "2016-01-01",
        "2016-01-02",
        spatial_unit=make_spatial_unit("admin", level=3),
    )
    with pytest.raises(ValueError, match="Spatial unit mismatch"):
        UnmovingAtReferenceLocation(
            locations=unique_locs, reference_locations=last_loc
        )
Example #6
0
def test_active_at_reference_location_counts(get_dataframe):
    """
    Values test for active at reference location counts.
    """
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    counts = ActiveAtReferenceLocationCounts(
        ActiveAtReferenceLocation(
            subscriber_locations=unique_locs,
            reference_locations=daily_location("2016-01-03"),
        )
    )
    result = get_dataframe(counts).set_index("pcod")
    assert result.loc["524 1 01 04"][0] == 1
def test_bad_window(size, match):
    """
    Median-unfriendly window sizes should raise a ValueError.
    """
    with pytest.raises(ValueError, match=match):
        locations = SubscriberLocations(
            "2016-01-01",
            "2016-01-07",
            spatial_unit=make_spatial_unit("lon-lat"),
            hours=(20, 0),
        )
        series = DistanceSeries(subscriber_locations=locations, statistic="min")
        IterativeMedianFilter(
            query_to_filter=ImputedDistanceSeries(distance_series=series),
            filter_window_size=size,
        )
def test_unique_locations(get_dataframe):
    """
    Values test for unique locations.
    """
    subscriber_locs = SubscriberLocations(
        "2016-01-01",
        "2016-01-02",
        spatial_unit=make_spatial_unit("admin", level=3),
    )
    result = get_dataframe(UniqueLocations(subscriber_locs)).set_index("subscriber")
    expected_pcods = [
        "524 2 04 20",
        "524 3 08 43",
        "524 4 12 62",
        "524 4 12 65",
    ]
    assert result.loc["038OVABN11Ak4W5P"].pcod.tolist() == expected_pcods
def test_column_must_exist(column_arg):
    """
    Referencing a non-existent column should raise a ValueError naming it.
    """
    with pytest.raises(ValueError, match=column_arg):
        locations = SubscriberLocations(
            "2016-01-01",
            "2016-01-07",
            spatial_unit=make_spatial_unit("lon-lat"),
            hours=(20, 0),
        )
        series = DistanceSeries(subscriber_locations=locations, statistic="min")
        IterativeMedianFilter(
            query_to_filter=ImputedDistanceSeries(distance_series=series),
            filter_window_size=3,
            **{column_arg: "NOT_A_VALID_COLUMN"},
        )
    def _unsampled_query_obj(self):
        """
        Return the underlying flowmachine unique locations object.

        Returns
        -------
        Query
        """
        locations = SubscriberLocations(
            self.start_date,
            self.end_date,
            spatial_unit=self.aggregation_unit,
            table=self.event_types,
            subscriber_subset=self.subscriber_subset,
        )
        return UniqueLocations(locations)
def _make_meaningful_locations_object(
    *,
    start_date,
    end_date,
    label,
    labels,
    event_types,
    subscriber_subset,
    tower_cluster_call_threshold,
    tower_cluster_radius,
    tower_day_of_week_scores,
    tower_hour_of_day_scores,
):
    """Build a MeaningfulLocations query from the exposed API parameters."""
    # NOTE: this 'versioned-site' spatial unit is not the same thing as the
    # exposed 'aggregation_unit' parameter.
    subscriber_locations = SubscriberLocations(
        start=start_date,
        stop=end_date,
        spatial_unit=make_spatial_unit("versioned-site"),
        table=event_types,
        subscriber_subset=subscriber_subset,
    )
    clusters = HartiganCluster(
        calldays=CallDays(subscriber_locations=subscriber_locations),
        radius=tower_cluster_radius,
        call_threshold=tower_cluster_call_threshold,
        buffer=0,  # 'buffer' is not exposed, so it is hard-coded here
    )
    scores = EventScore(
        start=start_date,
        stop=end_date,
        score_hour=tower_hour_of_day_scores,
        score_dow=tower_day_of_week_scores,
        spatial_unit=make_spatial_unit("versioned-site"),  # see NOTE above
        table=event_types,
        subscriber_subset=subscriber_subset,
    )
    return MeaningfulLocations(
        clusters=clusters, labels=labels, scores=scores, label=label
    )
Example #12
0
def test_unmoving_counts_column_names(get_column_names_from_run):
    """UnmovingCounts should expose pcod and value columns."""
    query = UnmovingCounts(
        Unmoving(
            locations=UniqueLocations(
                SubscriberLocations(
                    "2016-01-01",
                    "2016-01-01 10:00",
                    spatial_unit=make_spatial_unit("admin", level=3),
                )
            )
        )
    )
    assert get_column_names_from_run(query) == ["pcod", "value"]
Example #13
0
def test_buffered_hartigan():
    """
    Test whether Hartigan produces buffered clusters when buffer is larger than 0.
    """
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters = HartiganCluster(calldays=call_days, radius=50, buffer=2).to_geopandas()
    # The mock data has no geom_area in the site table, so we either get
    # clusters with area equivalent to 2km2 (values below are in degrees) or None.
    areas = clusters.geometry.area
    assert areas.min() == pytest.approx(0.0011327683603873115)
    assert areas.max() == pytest.approx(0.001166624454009738)
Example #14
0
def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe):
    """
    Test that the returned data is correct for a given subscriber-location pair.

    Each case gives (subscriber, site, start, end, expected call-day count).
    """
    test_values = (
        ("Z89mWDgZrr3qpnlB", "m9jL23", "2016-01-01", "2016-01-03", 2),
        ("Z89mWDgZrr3qpnlB", "qvkp6J", "2016-01-01", "2016-01-08", 4),
        ("038OVABN11Ak4W5P", "QeBRM8", "2016-01-01", "2016-01-03", 1),
        ("038OVABN11Ak4W5P", "nWM8R3", "2016-01-01", "2016-01-08", 5),
    )
    for subscriber, location, start, end, calls in test_values:
        cd = CallDays(
            SubscriberLocations(
                start, end, spatial_unit=make_spatial_unit("versioned-site")
            )
        )
        # f-string instead of dated str.format; the resulting query text is unchanged.
        df = get_dataframe(cd).query(
            f'subscriber == "{subscriber}" & site_id == "{location}"'
        )
        assert df.value.values[0] == calls
Example #15
0
def test_bigger_radius_yields_fewer_clusters(get_dataframe):
    """
    Increasing the clustering radius should never increase the number of
    clusters per subscriber.
    """
    radii = [1, 2, 5, 10, 50]
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    previous_counts = (
        get_dataframe(HartiganCluster(calldays=call_days, radius=radii[0]))
        .groupby("subscriber")
        .size()
    )
    for radius in radii[1:]:
        counts = (
            get_dataframe(HartiganCluster(calldays=call_days, radius=radius))
            .groupby("subscriber")
            .size()
        )
        assert all(previous_counts >= counts)
        previous_counts = counts
Example #16
0
def test_returns_expected_values_fixed_point(stat, sub_a_expected,
                                             sub_b_expected, get_dataframe):
    """
    Test that we get expected return values for the various statistics with 0, 0 reference
    """
    sub_a_id, sub_b_id = "j6QYNbMJgAwlVORP", "NG1km5NzBg5JD8nj"
    series = DistanceSeries(
        subscriber_locations=SubscriberLocations(
            "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lon-lat")
        ),
        statistic=stat,
    )
    result = get_dataframe(series).set_index(["subscriber", "datetime"])
    assert result.loc[(sub_a_id, date(2016, 1, 1))].value == pytest.approx(
        sub_a_expected
    )
    assert result.loc[(sub_b_id, date(2016, 1, 6))].value == pytest.approx(
        sub_b_expected
    )
Example #17
0
def test_unmoving_at_reference_location_counts_values(get_dataframe):
    """Values test for counts of subscribers unmoving at their reference location."""
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-01 10:00",
            spatial_unit=make_spatial_unit("admin", level=1),
        )
    )
    last_loc = LastLocation(
        "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("admin", level=1)
    )
    counts = UnmovingAtReferenceLocationCounts(
        UnmovingAtReferenceLocation(
            locations=unique_locs, reference_locations=last_loc
        )
    )
    result = get_dataframe(counts).set_index("pcod")
    assert result.loc["524 1"].value == 2
    assert result.loc["524 4"].value == 26
def test_redacted_active_at_reference_location_counts(get_dataframe):
    """
    Values test for redacted active at reference location counts.
    """
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    redacted = RedactedActiveAtReferenceLocationCounts(
        active_at_reference_location_counts=ActiveAtReferenceLocationCounts(
            ActiveAtReferenceLocation(
                subscriber_locations=unique_locs,
                reference_locations=daily_location("2016-01-03"),
            )
        )
    )
    result = get_dataframe(redacted).set_index("pcod")
    # Redaction should leave only regions with more than 15 subscribers.
    assert all(result.value > 15)
    assert len(result) == 2
    assert result.loc["524 3 08 44"].value == 25
def test_unmoving_at_reference_location_counts_column_names(
        get_column_names_from_run):
    """Redacted unmoving-at-reference-location counts should expose pcod/value."""
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-01 10:00",
            spatial_unit=make_spatial_unit("admin", level=1),
        )
    )
    last_loc = LastLocation(
        "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("admin", level=1)
    )
    query = RedactedUnmovingAtReferenceLocationCounts(
        unmoving_at_reference_location_counts=UnmovingAtReferenceLocationCounts(
            UnmovingAtReferenceLocation(
                locations=unique_locs, reference_locations=last_loc
            )
        )
    )
    assert get_column_names_from_run(query) == ["pcod", "value"]
Example #20
0
def test_impute(get_dataframe):
    """
    ImputedDistanceSeries should match the result of applying fill_in_dates
    to each subscriber's series with more than three observed days.
    """
    sl = SubscriberLocations(
        "2016-01-01",
        "2016-01-07",
        spatial_unit=make_spatial_unit("lon-lat"),
        hours=(20, 0),
    )
    ds = DistanceSeries(subscriber_locations=sl, statistic="min")
    ds_df = get_dataframe(ds)
    # Renamed from the misleading 'sql' — this is a dataframe, not a query string.
    imputed_df = get_dataframe(ImputedDistanceSeries(distance_series=ds))
    for sub in ds_df.subscriber.drop_duplicates():
        subscriber_series = ds_df[ds_df.subscriber == sub]
        # Imputation is only applied where more than 3 distinct days are present.
        if subscriber_series.datetime.nunique() > 3:
            to_be_imputed = subscriber_series.sort_values("datetime")
            expected = fill_in_dates(to_be_imputed, 3, sl.start, sl.stop)
            assert expected.value.values.tolist() == pytest.approx(
                imputed_df[imputed_df.subscriber == sub].value.tolist()
            )
def test_unique_visitor_counts_column_names(get_column_names_from_run):
    """UniqueVisitorCounts should expose pcod and value columns."""
    active_counts = ActiveAtReferenceLocationCounts(
        ActiveAtReferenceLocation(
            subscriber_locations=UniqueLocations(
                SubscriberLocations(
                    "2016-01-01",
                    "2016-01-02",
                    spatial_unit=make_spatial_unit("admin", level=3),
                )
            ),
            reference_locations=daily_location("2016-01-03"),
        )
    )
    unique_subscribers = UniqueSubscriberCounts(
        "2016-01-01",
        "2016-01-02",
        spatial_unit=make_spatial_unit("admin", level=3),
    )
    query = UniqueVisitorCounts(active_counts, unique_subscribers)
    assert get_column_names_from_run(query) == ["pcod", "value"]
Example #22
0
def test_error_on_spatial_unit_mismatch():
    """
    Test that error is raised if the spatial unit of the subscriber locations isn't point.
    """
    reference = daily_location(
        "2016-01-01", spatial_unit=make_spatial_unit("admin", level=3)
    )
    expected_message = (
        "reference_location must have the same spatial unit as subscriber_locations."
    )
    with pytest.raises(ValueError, match=expected_message):
        DistanceSeries(
            subscriber_locations=SubscriberLocations(
                "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lon-lat")
            ),
            reference_location=reference,
        )
def test_column_names_meaningful_locations(get_column_names_from_run):
    """ Test that column_names property matches head(0) for meaningfullocations"""
    clusters = HartiganCluster(
        calldays=CallDays(
            subscriber_locations=SubscriberLocations(
                start="2016-01-01",
                stop="2016-01-02",
                spatial_unit=make_spatial_unit("versioned-site"),
            )
        ),
        radius=1,
    )
    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-02",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    mfl = MeaningfulLocations(
        clusters=clusters, scores=scores, labels=labels, label="evening"
    )
    assert get_column_names_from_run(mfl) == mfl.column_names
def test_join_with_polygon(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can get the (arbitrary) polygon
    of each cell.
    """
    locations = SubscriberLocations(
        "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
    )
    polygon_unit = make_spatial_unit(
        "polygon",
        region_id_column_name="admin3pcod",
        geom_table="geography.admin3",
        geom_column="geom",
    )
    result = get_dataframe(JoinToLocation(locations, spatial_unit=polygon_unit))

    assert sorted(result.columns) == sorted(
        ["admin3pcod", "location_id", "subscriber", "time"]
    )
    # The join should not drop any rows.
    assert len(result) == get_length(locations)
def test_join_with_versioned_cells(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can fetch the cell version.
    """
    locations = SubscriberLocations(
        "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
    )
    result = get_dataframe(
        JoinToLocation(locations, spatial_unit=make_spatial_unit("versioned-cell"))
    )
    # As our database is complete we should not drop any rows
    assert len(result) == get_length(locations)

    at_moving_site = result.location_id.isin(moving_sites)
    # Rows before the changeover date, or at sites that never moved, should
    # all be version zero.
    should_be_version_zero = result[(result.time <= move_date) | (~at_moving_site)]
    # Rows after the changeover at sites that moved should all be version one.
    should_be_version_one = result[(result.time > move_date) & at_moving_site]

    assert (should_be_version_zero.version == 0).all()
    assert (should_be_version_one.version == 1).all()
def test_unique_visitor_counts(get_dataframe):
    """
    Values test for unique visitor counts.
    """
    active_counts = ActiveAtReferenceLocationCounts(
        ActiveAtReferenceLocation(
            subscriber_locations=UniqueLocations(
                SubscriberLocations(
                    "2016-01-01",
                    "2016-01-02",
                    spatial_unit=make_spatial_unit("admin", level=3),
                )
            ),
            reference_locations=daily_location("2016-01-03"),
        )
    )
    unique_subscribers = UniqueSubscriberCounts(
        "2016-01-01", "2016-01-02", spatial_unit=make_spatial_unit("admin", level=3)
    )
    result = get_dataframe(
        UniqueVisitorCounts(active_counts, unique_subscribers)
    ).set_index("pcod")
    assert result.loc["524 1 01 04"].value == 66
    assert result.loc["524 3 08 44"].value == 170
def test_join_with_lon_lat(get_dataframe):
    """
    Test that flowmachine.JoinToLocation can get the lon-lat values of the cell
    """
    ul = SubscriberLocations(
        "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
    )
    df = get_dataframe(JoinToLocation(ul, spatial_unit=make_spatial_unit("lon-lat")))

    expected_cols = sorted(["subscriber", "time", "location_id", "lon", "lat"])
    assert sorted(df.columns) == expected_cols
    # Pick out one cell that moves location and assert that the
    # lon-lats are right
    focal_cell = "dJb0Wd"
    lon1, lat1 = (83.09284486, 27.648837800000003)
    lon2, lat2 = (83.25769074752517, 27.661443318109132)
    post_move = df[(df.time > move_date) & (df["location_id"] == focal_cell)]
    pre_move = df[(df.time < move_date) & (df["location_id"] == focal_cell)]
    # And check them all one-by-one.
    # BUG FIX: these np.isclose checks were bare expressions with no assert,
    # so wrong coordinates could never fail the test.
    assert np.isclose(pre_move.lon, lon1).all()
    assert np.isclose(pre_move.lat, lat1).all()
    assert np.isclose(post_move.lon, lon2).all()
    assert np.isclose(post_move.lat, lat2).all()
Example #28
0
def test_different_call_days_format(get_dataframe):
    """
    Test whether we can pass different call days format such as table name, SQL query and CallDays class.
    """
    cd = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    # Passing the CallDays query object directly.
    clusters = get_dataframe(HartiganCluster(calldays=cd, radius=50))
    assert isinstance(clusters, pd.DataFrame)

    cd.store().result()

    # Passing the stored result as a Table.
    clusters = get_dataframe(
        HartiganCluster(calldays=Table(cd.fully_qualified_table_name), radius=50)
    )
    assert isinstance(clusters, pd.DataFrame)

    # Passing the raw SQL wrapped in a CustomQuery.
    cd_query = cd.get_query()
    clusters = get_dataframe(
        HartiganCluster(calldays=CustomQuery(cd_query, cd.column_names), radius=50)
    )
    assert isinstance(clusters, pd.DataFrame)
def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises():
    """ Test that a bad spatial unit raises an InvalidSpatialUnitError"""
    with pytest.raises(InvalidSpatialUnitError):
        # The constructor's result was previously bound to an unused local
        # ('mfl_agg'); the dead assignment has been dropped.
        MeaningfulLocationsAggregate(
            meaningful_locations=MeaningfulLocations(
                clusters=HartiganCluster(
                    calldays=CallDays(
                        subscriber_locations=SubscriberLocations(
                            start="2016-01-01",
                            stop="2016-01-02",
                            spatial_unit=make_spatial_unit("versioned-site"),
                        )
                    ),
                    radius=1,
                ),
                scores=EventScore(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                ),
                labels=labels,
                label="evening",
            ),
            spatial_unit=make_spatial_unit("lon-lat"),
        )
Example #30
0
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    Test that the first ranked call day of each subscriber is in the first cluster of each subscriber.
    """
    cd = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    cd_df = get_dataframe(cd)
    har_df = HartiganCluster(calldays=cd, radius=50).to_geopandas()

    key_columns = ["subscriber", "site_id", "version"]
    cd_first = cd_df[key_columns].groupby("subscriber").first()
    har_first = har_df[key_columns].groupby("subscriber").first()

    joined = cd_first.join(har_first, lsuffix="_cd", rsuffix="_har")
    in_first_cluster = joined.apply(
        lambda row: (row.site_id_cd in row.site_id_har)
        and (row.version_cd in row.version_har),
        axis=1,
    )
    assert all(in_first_cluster)