def test_join_returns_the_same_clusters():
    """
    Joining clusters to another table covering the identical start/stop
    window should reproduce the original clusters unchanged.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters = HartiganCluster(calldays=calldays, radius=50)
    baseline = clusters.to_geopandas()
    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    sort_cols = ["subscriber", "rank", "calldays"]
    joined = (
        clusters.join_to_cluster_components(scores)
        .to_geopandas()
        .sort_values(sort_cols)
        .reset_index(drop=True)
    )
    baseline = baseline.sort_values(sort_cols).reset_index(drop=True)
    # Compare only the columns both frames share in the same meaning.
    check_cols = ["subscriber", "geometry", "rank", "calldays"]
    assert all((joined[check_cols] == baseline[check_cols]).all())
def test_joined_hartigan_column_names():
    """The joined Hartigan query's column_names should agree with head(0)."""
    cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    joined = HartiganCluster(cd, 50).join_to_cluster_components(scores)
    assert joined.head(0).columns.tolist() == joined.column_names
def test_joined_hartigan_type_error():
    """Joining a HartiganCluster to a non-query object should raise TypeError."""
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters = HartiganCluster(calldays=calldays, radius=50)
    with pytest.raises(TypeError):
        clusters.join_to_cluster_components("banana")
def test_joined_hartigan_column_names(get_column_names_from_run):
    """The joined Hartigan query's column_names should match a real run."""
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    clusters = HartiganCluster(calldays=cd, radius=50)
    scores = EventScore(start="2016-01-01", stop="2016-01-05", level="versioned-site")
    joined = clusters.join_to_cluster_components(scores)
    assert get_column_names_from_run(joined) == joined.column_names
def test_meaningful_locations_od_results(get_dataframe):
    """
    OD between MeaningfulLocations should return the expected totals and
    count clusters per subscriber correctly.
    """
    # FIXME: Because of the nature of the test data, we can't actually test much for most admin levels because
    # the counts will always be below 15, and hence get redacted!

    def build_mfl(start, stop):
        # One day's 'unknown'-labelled meaningful locations.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start=start,
                        stop=stop,
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start=start,
                stop=stop,
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=labels,
            label="unknown",
        )

    mfl_od = MeaningfulLocationsOD(
        meaningful_locations_a=build_mfl("2016-01-01", "2016-01-02"),
        meaningful_locations_b=build_mfl("2016-01-02", "2016-01-03"),
        spatial_unit=make_spatial_unit("admin", level=1),
    )
    mfl_od_df = get_dataframe(mfl_od)
    # Aggregate should not include any counts below 15
    assert all(mfl_od_df.total > 15)
    # Smoke test one admin1 region gets the expected result
    regional = mfl_od_df[
        (mfl_od_df.pcod_from == "524 1") & (mfl_od_df.pcod_to == "524 4")
    ]
    assert regional.total[0] == pytest.approx(16.490_807)
    assert mfl_od_df.total.sum() == pytest.approx(350.806_012)
def test_meaningful_locations_od_results(get_dataframe, meaningful_locations_labels):
    """
    OD between MeaningfulLocations should return the expected flows and
    count clusters per subscriber correctly.
    """

    def build_mfl(start, stop):
        # One day's 'unknown'-labelled meaningful locations.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start=start,
                        stop=stop,
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start=start,
                stop=stop,
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=meaningful_locations_labels,
            label="unknown",
        )

    mfl_od = MeaningfulLocationsOD(
        meaningful_locations_a=build_mfl("2016-01-01", "2016-01-02"),
        meaningful_locations_b=build_mfl("2016-01-02", "2016-01-03"),
        spatial_unit=make_spatial_unit("admin", level=1),
    )
    mfl_od_df = get_dataframe(mfl_od)
    # Smoke test one admin1 region gets the expected result
    regional_flow = mfl_od_df[
        (mfl_od_df.pcod_from == "524 1") & (mfl_od_df.pcod_to == "524 4")
    ].value.tolist()[0]
    assert regional_flow == pytest.approx(16.490_807)
    assert mfl_od_df.value.sum() == pytest.approx(454.0)
def test_joined_hartigan_cluster_bad_query_column_names_raises_error():
    """
    Joining a HartiganCluster to a query lacking 'site_id' and 'version'
    columns should raise a ValueError.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters = HartiganCluster(calldays=calldays, radius=50)
    # lon-lat scores have no site_id/version columns to join on.
    bad_scores = EventScore(
        start="2016-01-01", stop="2016-01-04", spatial_unit=make_spatial_unit("lon-lat")
    )
    with pytest.raises(ValueError):
        clusters.join_to_cluster_components(bad_scores)
def test_column_names_meaningful_locations_od(
    exemplar_spatial_unit_param, get_column_names_from_run, meaningful_locations_labels
):
    """column_names of an OD between meaningful locations should match a run."""
    if not exemplar_spatial_unit_param.is_polygon:
        pytest.xfail(
            f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for ODs between MeaningfulLocations."
        )

    def build_mfl(label):
        # Meaningful locations for the given label over the same day.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start="2016-01-01",
                        stop="2016-01-02",
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start="2016-01-01",
                stop="2016-01-02",
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=meaningful_locations_labels,
            label=label,
        )

    mfl_od = MeaningfulLocationsOD(
        meaningful_locations_a=build_mfl("evening"),
        meaningful_locations_b=build_mfl("unknown"),
        spatial_unit=exemplar_spatial_unit_param,
    )
    assert get_column_names_from_run(mfl_od) == mfl_od.column_names
def test_meaningful_locations_od_redaction(get_dataframe, meaningful_locations_labels):
    """
    Redacted OD between MeaningfulLocations should only contain values
    strictly greater than 15.
    """

    def build_mfl(start, stop):
        # One day's 'unknown'-labelled meaningful locations.
        return MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start=start,
                        stop=stop,
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start=start,
                stop=stop,
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=meaningful_locations_labels,
            label="unknown",
        )

    mfl_od = RedactedMeaningfulLocationsOD(
        meaningful_locations_od=MeaningfulLocationsOD(
            meaningful_locations_a=build_mfl("2016-01-01", "2016-01-02"),
            meaningful_locations_b=build_mfl("2016-01-02", "2016-01-03"),
            spatial_unit=make_spatial_unit("admin", level=1),
        )
    )
    mfl_od_df = get_dataframe(mfl_od)
    # Aggregate should not include any counts below 15
    assert all(mfl_od_df.value > 15)
def test_call_threshold_works(get_dataframe):
    """
    A call threshold above 1 should reduce the number of clusters.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    baseline = HartiganCluster(calldays=calldays, radius=50).to_geopandas()
    # Sanity check: there are single-call-day clusters for the threshold to act on.
    assert any(baseline.calldays == 1)
    thresholded = get_dataframe(
        HartiganCluster(calldays=calldays, radius=50, call_threshold=2)
    )
    assert len(baseline) > len(thresholded)
def test_cluster_is_within_envelope(get_dataframe):
    """
    Every cluster should intersect the envelope formed by the towers
    assigned to it.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters_df = HartiganCluster(calldays=calldays, radius=50).to_geopandas()
    sites = Sites().to_geopandas().set_index(["site_id", "version"])
    # Look up the tower geometry for each cluster row.
    towers = GeoSeries(clusters_df.apply(lambda row: get_geom_point(row, sites), 1))
    assert all(clusters_df.intersects(towers))
def test_meaningful_locations_od_raises_for_bad_spatial_unit(
    exemplar_spatial_unit_param, get_dataframe
):
    """
    An OD between meaningful locations should raise InvalidSpatialUnitError
    when given an unsupported spatial unit.
    """
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=labels,
        label="evening",
    )
    with pytest.raises(InvalidSpatialUnitError):
        MeaningfulLocationsOD(
            meaningful_locations_a=mfl,
            meaningful_locations_b=mfl,
            spatial_unit=make_spatial_unit("lon-lat"),
        )
def test_call_threshold_works(self):
    """
    Test whether a call threshold above 1 limits the number of clusters.
    """
    # Sanity check: the unthresholded clustering contains single-call-day rows.
    self.assertTrue(any(self.har_df.calldays == 1))
    har = HartiganCluster(self.cd, 50, call_threshold=2).get_dataframe()
    # Bug fix: the original asserted on the unfiltered self.har_df — a tautology
    # given the check above — and never inspected `har` at all. The assertion
    # must run against the thresholded result.
    self.assertFalse(all(har.calldays > 1))
def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises(
    meaningful_locations_labels,
):
    """A disallowed spatial unit should raise an InvalidSpatialUnitError."""
    with pytest.raises(InvalidSpatialUnitError):
        MeaningfulLocationsAggregate(
            meaningful_locations=MeaningfulLocations(
                clusters=HartiganCluster(
                    calldays=CallDays(
                        subscriber_locations=SubscriberLocations(
                            start="2016-01-01",
                            stop="2016-01-02",
                            spatial_unit=make_spatial_unit("versioned-site"),
                        )
                    ),
                    radius=1,
                ),
                scores=EventScore(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                ),
                labels=meaningful_locations_labels,
                label="evening",
            ),
            spatial_unit=make_spatial_unit("lon-lat"),
        )
def test_meaningful_locations_results(
    label, expected_number_of_clusters, get_dataframe
):
    """
    MeaningfulLocations should return the expected number of clusters and
    count clusters per subscriber correctly.
    """
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=labels,
        label=label,
    )
    mfl_df = get_dataframe(mfl)
    assert len(mfl_df) == expected_number_of_clusters
    per_subscriber = mfl_df.groupby(
        ["subscriber", "label", "n_clusters"], as_index=False
    ).count()
    # Check that query has correctly counted the number of clusters per subscriber
    assert all(per_subscriber.n_clusters == per_subscriber.cluster)
def test_meaningful_locations_aggregation_results(exemplar_level_param, get_dataframe):
    """
    Aggregating MeaningfulLocations should redact counts of 15 or below and
    sum to fewer than the number of unique subscribers.
    """
    if exemplar_level_param["level"] not in MeaningfulLocationsAggregate.allowed_levels:
        pytest.xfail(
            f'The level "{exemplar_level_param["level"]}" is not supported as an aggregation unit for MeaningfulLocations.'
        )
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=subscriber_locations(
                    start="2016-01-01", stop="2016-01-02", level="versioned-site"
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label="evening",
    )
    mfl_agg = MeaningfulLocationsAggregate(
        meaningful_locations=mfl, **exemplar_level_param
    )
    mfl_df = get_dataframe(mfl)
    mfl_agg_df = get_dataframe(mfl_agg)
    # Aggregate should not include any counts below 15
    assert all(mfl_agg_df.total > 15)
    # Sum of aggregate should be less than the number of unique subscribers
    assert mfl_agg_df.total.sum() < mfl_df.subscriber.nunique()
def test_column_names_meaningful_locations_aggregate(
    exemplar_level_param, get_column_names_from_run
):
    """column_names of aggregated meaningful locations should match a run."""
    if exemplar_level_param["level"] not in MeaningfulLocationsAggregate.allowed_levels:
        pytest.xfail(
            f'The level "{exemplar_level_param["level"]}" is not supported as an aggregation unit for MeaningfulLocations.'
        )
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=subscriber_locations(
                    start="2016-01-01", stop="2016-01-02", level="versioned-site"
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label="evening",
    )
    mfl_agg = MeaningfulLocationsAggregate(
        meaningful_locations=mfl, **exemplar_level_param
    )
    assert get_column_names_from_run(mfl_agg) == mfl_agg.column_names
def test_different_call_days_format(self):
    """
    Call days may be supplied as a CallDays query, a stored Table, or a
    CustomQuery over the same SQL.
    """
    cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
    self.assertIsInstance(HartiganCluster(cd, 50).get_dataframe(), pd.DataFrame)
    # Store the call days so a Table reference to them is valid.
    cd.store().result()
    self.assertIsInstance(
        HartiganCluster(Table(cd.table_name), 50).get_dataframe(), pd.DataFrame
    )
    self.assertIsInstance(
        HartiganCluster(CustomQuery(cd.get_query()), 50).get_dataframe(),
        pd.DataFrame,
    )
def test_meaningful_locations_aggregation_results(
    exemplar_spatial_unit_param, get_dataframe
):
    """
    Aggregating MeaningfulLocations should redact values of 15 or below and
    sum to fewer than the number of unique subscribers.
    """
    if not exemplar_spatial_unit_param.is_polygon:
        pytest.xfail(
            f"The spatial unit {exemplar_spatial_unit_param} is not supported as an aggregation unit for MeaningfulLocations."
        )
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=labels,
        label="evening",
    )
    mfl_agg = MeaningfulLocationsAggregate(
        meaningful_locations=mfl, spatial_unit=exemplar_spatial_unit_param
    )
    mfl_df = get_dataframe(mfl)
    mfl_agg_df = get_dataframe(mfl_agg)
    # Aggregate should not include any counts below 15
    assert all(mfl_agg_df.total > 15)
    # Sum of aggregate should be less than the number of unique subscribers
    assert mfl_agg_df.total.sum() < mfl_df.subscriber.nunique()
def test_meaningful_locations_aggregation_results(
    exemplar_spatial_unit_param, get_dataframe, meaningful_locations_labels
):
    """
    Redacted aggregation of MeaningfulLocations should exclude all values
    of 15 or below.
    """
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=SubscriberLocations(
                    start="2016-01-01",
                    stop="2016-01-02",
                    spatial_unit=make_spatial_unit("versioned-site"),
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01",
            stop="2016-01-02",
            spatial_unit=make_spatial_unit("versioned-site"),
        ),
        labels=meaningful_locations_labels,
        label="evening",
    )
    mfl_agg = RedactedMeaningfulLocationsAggregate(
        meaningful_locations_aggregate=MeaningfulLocationsAggregate(
            meaningful_locations=mfl,
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    mfl_agg_df = get_dataframe(mfl_agg)
    # Aggregate should not include any counts below 15
    assert all(mfl_agg_df.value > 15)
def test_hartigan_column_names(get_column_names_from_run):
    """Hartigan's column_names property should match a real run."""
    clusters = HartiganCluster(
        calldays=CallDays(
            SubscriberLocations(
                "2016-01-01",
                "2016-01-04",
                spatial_unit=make_spatial_unit("versioned-site"),
            )
        ),
        radius=50,
    )
    assert get_column_names_from_run(clusters) == clusters.column_names
def test_bigger_radius_yields_fewer_clusters(get_dataframe):
    """
    Increasing the clustering radius should never increase the number of
    clusters per subscriber.
    """
    radii = [1, 2, 5, 10, 50]
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    previous_counts = (
        get_dataframe(HartiganCluster(calldays=calldays, radius=radii[0]))
        .groupby("subscriber")
        .size()
    )
    for radius in radii[1:]:
        counts = (
            get_dataframe(HartiganCluster(calldays=calldays, radius=radius))
            .groupby("subscriber")
            .size()
        )
        # Cluster counts are monotonically non-increasing with radius.
        assert all(previous_counts >= counts)
        previous_counts = counts
def test_hartigan_cluster_bad_calldays_column_names_raises_error():
    """
    Calldays lacking 'site_id' and 'version' columns should raise ValueError.
    """
    bad_calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("lon-lat")
        )
    )
    with pytest.raises(ValueError):
        HartiganCluster(calldays=bad_calldays, radius=50)
def test_buffered_hartigan(self):
    """
    Test whether Hartigan produces buffered clusters when buffer is larger than 0.
    """
    har = HartiganCluster(self.cd, 50, buffer=2).to_geopandas()
    areas = har.geometry.area
    # since the mock data does not have geom_area in the site table we either
    # get the clusters with area equivalent to 2km2 (areas below are in degrees) or None.
    min_area = areas.min()
    max_area = areas.max()
    # Fix: assertAlmostEquals is a deprecated alias of assertAlmostEqual,
    # removed in Python 3.12 — use the canonical name.
    self.assertAlmostEqual(min_area, 0.001, 3)
    self.assertAlmostEqual(max_area, 0.001, 3)
def test_different_call_days_format(get_dataframe):
    """
    Call days may be supplied as a CallDays query, a stored Table, or a
    CustomQuery over the same SQL.
    """
    cd = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    assert isinstance(
        get_dataframe(HartiganCluster(calldays=cd, radius=50)), pd.DataFrame
    )
    # Store the call days so a Table reference to them is valid.
    cd.store().result()
    from_table = HartiganCluster(
        calldays=Table(cd.fully_qualified_table_name), radius=50
    )
    assert isinstance(get_dataframe(from_table), pd.DataFrame)
    from_query = HartiganCluster(
        calldays=CustomQuery(cd.get_query(), cd.column_names), radius=50
    )
    assert isinstance(get_dataframe(from_query), pd.DataFrame)
def test_all_options_hartigan():
    """
    Hartigan should work with buffer and call_threshold supplied together.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clustered = HartiganCluster(
        calldays=calldays, radius=50, buffer=2, call_threshold=2
    ).to_geopandas()
    expected_columns = {
        "subscriber",
        "geometry",
        "rank",
        "calldays",
        "site_id",
        "version",
        "centroid",
    }
    assert set(clustered.columns) == expected_columns
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    The top-ranked call day of each subscriber should appear in that
    subscriber's first cluster.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    calldays_df = get_dataframe(calldays)
    clusters_df = HartiganCluster(calldays=calldays, radius=50).to_geopandas()
    keep = ["subscriber", "site_id", "version"]
    first_calldays = calldays_df[keep].groupby("subscriber").first()
    first_clusters = clusters_df[keep].groupby("subscriber").first()
    joined = first_calldays.join(first_clusters, lsuffix="_cd", rsuffix="_har")
    # Each first call day's site/version must be contained in the first cluster's.
    in_first_cluster = joined.apply(
        lambda x: (x.site_id_cd in x.site_id_har) and (x.version_cd in x.version_har),
        axis=1,
    )
    assert all(in_first_cluster)
def _make_meaningful_locations_object(
    *,
    start_date,
    end_date,
    label,
    labels,
    event_types,
    subscriber_subset,
    tower_cluster_call_threshold,
    tower_cluster_radius,
    tower_day_of_week_scores,
    tower_hour_of_day_scores,
):
    """
    Assemble a MeaningfulLocations query from its constituent sub-queries.

    The 'versioned-site' spatial unit used internally here is not the same
    thing as the exposed parameter 'aggregation_unit'.
    """
    locations = SubscriberLocations(
        start=start_date,
        stop=end_date,
        spatial_unit=make_spatial_unit("versioned-site"),
        table=event_types,
        subscriber_subset=subscriber_subset,
    )
    clusters = HartiganCluster(
        calldays=CallDays(subscriber_locations=locations),
        radius=tower_cluster_radius,
        call_threshold=tower_cluster_call_threshold,
        buffer=0,  # 'buffer' is deliberately not exposed, so it is hard-coded
    )
    scores = EventScore(
        start=start_date,
        stop=end_date,
        score_hour=tower_hour_of_day_scores,
        score_dow=tower_day_of_week_scores,
        spatial_unit=make_spatial_unit("versioned-site"),
        table=event_types,
        subscriber_subset=subscriber_subset,
    )
    return MeaningfulLocations(
        clusters=clusters, labels=labels, scores=scores, label=label
    )
def test_column_names_meaningful_locations(get_column_names_from_run):
    """column_names of MeaningfulLocations should match a real run."""
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=subscriber_locations(
                    start="2016-01-01", stop="2016-01-02", level="versioned-site"
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label="evening",
    )
    assert get_column_names_from_run(mfl) == mfl.column_names
def test_buffered_hartigan():
    """
    With a positive buffer, Hartigan should emit buffered cluster geometries.
    """
    calldays = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    buffered = HartiganCluster(calldays=calldays, radius=50, buffer=2).to_geopandas()
    # since the mock data does not have geom_area in the site table we either
    # get the clusters with area equivalent to 2km2 (areas below are in degrees) or None.
    areas = buffered.geometry.area
    assert areas.min() == pytest.approx(0.0011327683603873115)
    assert areas.max() == pytest.approx(0.001166624454009738)