def test_join_returns_the_same_clusters():
    """
    Joining the clusters to a query that covers the same start and stop
    dates must give back exactly the same clusters.
    """
    sort_keys = ["subscriber", "rank", "calldays"]
    compared_columns = ["subscriber", "geometry", "rank", "calldays"]

    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    clusters = HartiganCluster(calldays=CallDays(locations), radius=50)
    expected = clusters.to_geopandas()

    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    joined = clusters.join_to_cluster_components(scores).to_geopandas()

    # Put both frames in the same row order with a clean index so the
    # element-wise comparison lines rows up positionally.
    joined = joined.sort_values(sort_keys).reset_index(drop=True)
    expected = expected.sort_values(sort_keys).reset_index(drop=True)

    matches = joined[compared_columns] == expected[compared_columns]
    assert all(matches.all())
def test_cluster_is_within_envelope(get_dataframe):
    """
    Every cluster geometry must intersect the geometry built from the
    towers that make up that cluster.
    """
    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    clusters = HartiganCluster(calldays=CallDays(locations), radius=50)
    clusters_gdf = clusters.to_geopandas()

    # Index the sites by (site_id, version) before handing them to
    # get_geom_point.
    indexed_sites = Sites().to_geopandas().set_index(["site_id", "version"])

    def tower_geometry(row):
        return get_geom_point(row, indexed_sites)

    tower_geoms = GeoSeries(clusters_gdf.apply(tower_geometry, axis=1))
    overlaps = clusters_gdf.intersects(tower_geoms)
    assert all(overlaps)
def test_call_threshold_works(get_dataframe):
    """
    Raising the call threshold above 1 must reduce the number of
    clusters returned.
    """
    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    call_days = CallDays(locations)

    default_clusters = HartiganCluster(calldays=call_days, radius=50)
    default_df = default_clusters.to_geopandas()
    # The default result must contain single-call-day clusters, otherwise
    # the comparison below would not exercise the threshold.
    assert any(default_df.calldays == 1)

    thresholded_clusters = HartiganCluster(
        calldays=call_days, radius=50, call_threshold=2
    )
    thresholded_df = get_dataframe(thresholded_clusters)
    assert len(default_df) > len(thresholded_df)
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    For each subscriber, the site and version of their first call-day row
    must be contained in the site ids and versions of their first cluster.
    """
    id_columns = ["subscriber", "site_id", "version"]

    def first_per_subscriber(frame):
        return frame[id_columns].groupby("subscriber").first()

    def first_site_in_first_cluster(row):
        return (row.site_id_cd in row.site_id_har) and (
            row.version_cd in row.version_har
        )

    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    call_days = CallDays(locations)
    call_days_df = get_dataframe(call_days)

    clusters = HartiganCluster(calldays=call_days, radius=50)
    clusters_df = clusters.to_geopandas()

    first_call_days = first_per_subscriber(call_days_df)
    first_clusters = first_per_subscriber(clusters_df)
    side_by_side = first_call_days.join(
        first_clusters, lsuffix="_cd", rsuffix="_har"
    )

    contained = side_by_side.apply(first_site_in_first_cluster, axis=1)
    assert all(contained)
class TestHartiganCluster(TestCase):
    """
    unittest-style checks of HartiganCluster built from versioned-site
    call days.

    ``setUp`` gives every test fresh ``cd``, ``cd_df``, ``hartigan`` and
    ``har_df`` attributes.
    """

    # Columns every clusters geodataframe is expected to expose.
    _expected_columns = {
        "subscriber",
        "geometry",
        "rank",
        "calldays",
        "site_id",
        "version",
        "centroid",
    }

    def setUp(self):
        """Build the call days and the radius-50 clustering used by the tests."""
        self.cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
        self.cd_df = self.cd.get_dataframe()
        self.hartigan = HartiganCluster(self.cd, 50)
        self.har_df = self.hartigan.to_geopandas()

    def tearDown(self):
        """Invalidate the database cache of every stored CallDays query."""
        for cd in CallDays.get_stored():
            cd.invalidate_db_cache()

    def test_returns_dataframe(self):
        """
        Tests that it returns a dataframe with the correct columns
        """
        self.assertIsInstance(self.har_df, pd.DataFrame)
        self.assertEqual(set(self.har_df.columns), self._expected_columns)

    def test_cluster_is_within_envelope(self):
        """
        Test that all the clusters are within the envelope formed by all
        the towers in the cluster.
        """
        sites = Sites().to_geopandas()
        self.har_df["towers"] = self.har_df.apply(
            lambda x: get_geom_point(x, sites), axis=1
        )
        # Compare each cluster geometry against the bounding box of its
        # towers geometry.
        s = self.har_df.apply(
            lambda x: x.geometry.intersects(box(*x.towers.bounds)), axis=1
        )
        self.assertTrue(all(s))

    def test_first_call_day_in_first_cluster(self):
        """
        Test that the first ranked call day of each subscriber is in the
        first cluster of each subscriber.
        """
        id_columns = ["subscriber", "site_id", "version"]
        cd_first = self.cd_df[id_columns].groupby("subscriber").first()
        har_first = self.har_df[id_columns].groupby("subscriber").first()

        joined = cd_first.join(har_first, lsuffix="_cd", rsuffix="_har")
        s = joined.apply(
            lambda x: (x.site_id_cd in x.site_id_har)
            and (x.version_cd in x.version_har),
            axis=1,
        )
        self.assertTrue(all(s))

    def test_bigger_radius_yields_fewer_clusters(self):
        """
        Test whether bigger radius yields fewer clusters per subscriber
        """
        radius = [1, 2, 5, 10, 50]

        h = HartiganCluster(self.cd, radius[0]).get_dataframe()
        nclusters_small_radius = h.groupby("subscriber").size()

        # Walk up the radii, always comparing against the previous radius.
        for r in radius[1:]:
            h = HartiganCluster(self.cd, r).get_dataframe()
            nclusters_big_radius = h.groupby("subscriber").size()
            self.assertTrue(all(nclusters_small_radius >= nclusters_big_radius))
            nclusters_small_radius = nclusters_big_radius

    def test_different_call_days_format(self):
        """
        Test whether we can pass different call days format such as table
        name, SQL query and CallDays class.
        """
        cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
        har = HartiganCluster(cd, 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

        # Presumably waits for the store to finish, so the table exists
        # before it is referenced by name.
        cd.store().result()

        har = HartiganCluster(Table(cd.table_name), 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

        cd_query = cd.get_query()
        har = HartiganCluster(CustomQuery(cd_query), 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

    def test_call_threshold_works(self):
        """
        Test whether a call threshold above 1 limits the number of clusters.
        """
        # The default result must contain single-call-day clusters, otherwise
        # the comparison below would not exercise the threshold.
        self.assertTrue(any(self.har_df.calldays == 1))

        har = HartiganCluster(self.cd, 50, call_threshold=2).get_dataframe()
        # Compare against the thresholded result. Asserting on self.har_df
        # again would only repeat the check above and never use `har`.
        self.assertGreater(len(self.har_df), len(har))

    def test_buffered_hartigan(self):
        """
        Test whether Hartigan produces buffered clusters when buffer is
        larger than 0.
        """
        har = HartiganCluster(self.cd, 50, buffer=2).to_geopandas()
        areas = har.geometry.area
        # since the mock data does not have geom_area in the site table we
        # either get the clusters with area equivalent to 2km2 (areas below
        # are in degrees) or None.
        min_area = areas.min()
        max_area = areas.max()
        self.assertAlmostEqual(min_area, 0.001, 3)
        self.assertAlmostEqual(max_area, 0.001, 3)

    def test_all_options_hartigan(self):
        """
        Test whether Hartigan works when changing all options.
        """
        har = HartiganCluster(
            self.cd, 50, buffer=2, call_threshold=2
        ).to_geopandas()
        self.assertIsInstance(har, pd.DataFrame)
        self.assertEqual(set(har.columns), self._expected_columns)

    def test_join_returns_the_same_clusters(self):
        """
        Test whether joining to another table for which the start and stop
        time are the same yields the same clusters.
        """
        sort_keys = ["subscriber", "rank", "calldays"]
        es = EventScore(
            start="2016-01-01", stop="2016-01-04", level="versioned-site"
        )

        joined = (
            self.hartigan.join_to_cluster_components(es)
            .to_geopandas()
            .sort_values(sort_keys)
        )
        joined.reset_index(inplace=True, drop=True)

        # Put the reference frame in the same row order with a clean index
        # so the element-wise comparison lines rows up positionally.
        self.har_df.sort_values(sort_keys, inplace=True)
        self.har_df.reset_index(inplace=True, drop=True)

        cols = ["subscriber", "geometry", "rank", "calldays"]
        compare = joined[cols] == self.har_df[cols]
        self.assertTrue(all(compare.all()))