def test_all_options_hartigan():
    """
    Test whether Hartigan works when changing all options.
    """
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    har = HartiganCluster(
        calldays=cd, radius=50, buffer=2, call_threshold=2
    ).to_geopandas()
    expected_columns = {
        "subscriber",
        "geometry",
        "rank",
        "calldays",
        "site_id",
        "version",
        "centroid",
    }
    assert set(har.columns) == expected_columns
def test_call_threshold_works(get_dataframe):
    """
    Test whether a call threshold above 1 limits the number of clusters.
    """
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    unthresholded = HartiganCluster(calldays=cd, radius=50).to_geopandas()
    # There must be at least one single-call day for the threshold to remove.
    assert any(unthresholded.calldays == 1)
    thresholded = get_dataframe(
        HartiganCluster(calldays=cd, radius=50, call_threshold=2)
    )
    assert len(unthresholded) > len(thresholded)
def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe):
    """
    Test that the returned data is correct for a given subscriber-location pair.
    """
    test_values = (
        ("Z89mWDgZrr3qpnlB", "m9jL23", "2016-01-01", "2016-01-03", 2),
        ("Z89mWDgZrr3qpnlB", "qvkp6J", "2016-01-01", "2016-01-08", 4),
        ("038OVABN11Ak4W5P", "QeBRM8", "2016-01-01", "2016-01-03", 1),
        ("038OVABN11Ak4W5P", "nWM8R3", "2016-01-01", "2016-01-08", 5),
    )
    for subscriber, location, start, end, expected_calls in test_values:
        cd = CallDays(subscriber_locations(start, end, level="versioned-site"))
        df = get_dataframe(cd).query(
            f'subscriber == "{subscriber}" & site_id == "{location}"'
        )
        assert df.calldays.values[0] == expected_calls
def test_column_names_meaningful_locations(get_column_names_from_run):
    """Test that column_names property matches head(0) for meaningful locations."""
    clusters = HartiganCluster(
        calldays=CallDays(
            subscriber_locations=subscriber_locations(
                start="2016-01-01", stop="2016-01-02", level="versioned-site"
            )
        ),
        radius=1,
    )
    scores = EventScore(
        start="2016-01-01", stop="2016-01-02", level="versioned-site"
    )
    mfl = MeaningfulLocations(
        clusters=clusters, scores=scores, labels=labels, label="evening"
    )
    assert get_column_names_from_run(mfl) == mfl.column_names
def test_buffered_hartigan():
    """
    Test whether Hartigan produces buffered clusters when buffer is larger than 0.
    """
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    clusters = HartiganCluster(calldays=cd, radius=50, buffer=2).to_geopandas()
    areas = clusters.geometry.area
    # since the mock data does not have geom_area in the site table we either
    # get the clusters with area equivalent to 2km2 (areas below are in degrees) or None.
    assert areas.min() == pytest.approx(0.0011327683603873115)
    assert areas.max() == pytest.approx(0.001166624454009738)
def test_join_with_polygon(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can get the (arbitrary) polygon of each cell.
    """
    locations = subscriber_locations("2016-01-05", "2016-01-07", level="cell")
    joined = JoinToLocation(
        locations,
        level="polygon",
        column_name="admin3pcod",
        polygon_table="geography.admin3",
        geom_col="geom",
    )
    df = get_dataframe(joined)
    assert sorted(df.columns) == sorted(
        ["admin3pcod", "location_id", "subscriber", "time"]
    )
    # No rows should be dropped by the join.
    assert len(df) == get_length(locations)
def test_bigger_radius_yields_fewer_clusters(get_dataframe):
    """
    Test whether bigger radius yields fewer clusters per subscriber
    """
    radii = [1, 2, 5, 10, 50]
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    previous_counts = (
        get_dataframe(HartiganCluster(calldays=cd, radius=radii[0]))
        .groupby("subscriber")
        .size()
    )
    for radius in radii[1:]:
        current_counts = (
            get_dataframe(HartiganCluster(calldays=cd, radius=radius))
            .groupby("subscriber")
            .size()
        )
        # Cluster counts per subscriber must be monotonically non-increasing
        # as the radius grows.
        assert all(previous_counts >= current_counts)
        previous_counts = current_counts
def test_join_with_versioned_cells(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can fetch the cell version.
    """
    locations = subscriber_locations("2016-01-05", "2016-01-07", level="cell")
    df = get_dataframe(JoinToLocation(locations, level="versioned-cell"))
    # As our database is complete we should not drop any rows
    assert len(df) == get_length(locations)
    moved = df.location_id.isin(moving_sites)
    # Rows before the changeover date, or for towers that never moved, should
    # carry version zero; rows after the changeover for moved towers should
    # carry version one.
    should_be_version_zero = df[(df.time <= move_date) | ~moved]
    should_be_version_one = df[(df.time > move_date) & moved]
    assert (should_be_version_zero.version == 0).all()
    assert (should_be_version_one.version == 1).all()
def test_join_with_lat_lon(get_dataframe):
    """
    Test that flowmachine.JoinToLocation can get the lat-lon values of the cell
    """
    ul = subscriber_locations("2016-01-05", "2016-01-07", level="cell")
    df = get_dataframe(JoinToLocation(ul, level="lat-lon"))
    expected_cols = sorted(["subscriber", "time", "location_id", "lat", "lon"])
    assert sorted(df.columns) == expected_cols
    # Pick out one cell that moves location and assert that the
    # lat-lons are right
    focal_cell = "dJb0Wd"
    lat1, long1 = (27.648837800000003, 83.09284486)
    lat2, long2 = (27.661443318109132, 83.25769074752517)
    post_move = df[(df.time > move_date) & (df["location_id"] == focal_cell)]
    pre_move = df[(df.time < move_date) & (df["location_id"] == focal_cell)]
    # And check them all one-by-one. NB: these must be asserted — the original
    # bare `np.isclose(...).all()` expressions evaluated to a value that was
    # simply discarded, so the checks could never fail.
    assert np.isclose(pre_move.lat, lat1).all()
    assert np.isclose(pre_move.lon, long1).all()
    assert np.isclose(post_move.lat, lat2).all()
    assert np.isclose(post_move.lon, long2).all()
def _make_meaningful_locations_object(
    *,
    start_date,
    stop_date,
    label,
    labels,
    subscriber_subset,
    tower_cluster_call_threshold,
    tower_cluster_radius,
    tower_day_of_week_scores,
    tower_hour_of_day_scores,
):
    """Assemble a MeaningfulLocations query from the exposed API parameters."""
    locations = subscriber_locations(
        start=start_date,
        stop=stop_date,
        level="versioned-site",  # note this 'level' is not the same as the exposed parameter 'aggregation_unit'
        subscriber_subset=subscriber_subset,
    )
    clusters = HartiganCluster(
        calldays=CallDays(subscriber_locations=locations),
        radius=tower_cluster_radius,
        call_threshold=tower_cluster_call_threshold,
        buffer=0,  # we're not exposing 'buffer', apparently, so we're hard-coding it
    )
    scores = EventScore(
        start=start_date,
        stop=stop_date,
        score_hour=tower_hour_of_day_scores,
        score_dow=tower_day_of_week_scores,
        level="versioned-site",  # note this 'level' is not the same as the exposed parameter 'aggregation_unit'
        subscriber_subset=subscriber_subset,
    )
    return MeaningfulLocations(
        clusters=clusters, labels=labels, scores=scores, label=label
    )
def test_meaningful_locations_aggregate_disallowed_level_raises():
    """Test that a bad level raises a BadLevelError"""
    with pytest.raises(BadLevelError):
        meaningful_locations = MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=subscriber_locations(
                        start="2016-01-01",
                        stop="2016-01-02",
                        level="versioned-site",
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start="2016-01-01", stop="2016-01-02", level="versioned-site"
            ),
            labels=labels,
            label="evening",
        )
        MeaningfulLocationsAggregate(
            meaningful_locations=meaningful_locations, level="NOT_A_LEVEL"
        )
def test_meaningful_locations_od_raises_for_bad_level(
    exemplar_level_param, get_dataframe
):
    """
    Test that od on meaningful locations raises a BadLevelError for a bad level.
    """
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=subscriber_locations(
                    start="2016-01-01", stop="2016-01-02", level="versioned-site"
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label="evening",
    )
    with pytest.raises(BadLevelError):
        MeaningfulLocationsOD(
            meaningful_locations_a=mfl,
            meaningful_locations_b=mfl,
            level="NOT_A_LEVEL",
        )
def test_different_call_days_format(get_dataframe):
    """
    Test whether we can pass different call days format such as table name, SQL query
    and CallDays class.
    """
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    # 1. Directly as a CallDays query object.
    from_object = get_dataframe(HartiganCluster(calldays=cd, radius=50))
    assert isinstance(from_object, pd.DataFrame)

    # 2. As a stored database table.
    cd.store().result()
    from_table = get_dataframe(
        HartiganCluster(calldays=Table(cd.fully_qualified_table_name), radius=50)
    )
    assert isinstance(from_table, pd.DataFrame)

    # 3. As a raw SQL query wrapped in CustomQuery.
    from_query = get_dataframe(
        HartiganCluster(
            calldays=CustomQuery(cd.get_query(), cd.column_names), radius=50
        )
    )
    assert isinstance(from_query, pd.DataFrame)
def test_meaningful_locations_results(
    label, expected_number_of_clusters, get_dataframe
):
    """
    Test that MeaningfulLocations returns expected results and counts clusters
    per subscriber correctly.
    """
    mfl = MeaningfulLocations(
        clusters=HartiganCluster(
            calldays=CallDays(
                subscriber_locations=subscriber_locations(
                    start="2016-01-01", stop="2016-01-02", level="versioned-site"
                )
            ),
            radius=1,
        ),
        scores=EventScore(
            start="2016-01-01", stop="2016-01-02", level="versioned-site"
        ),
        labels=labels,
        label=label,
    )
    mfl_df = get_dataframe(mfl)
    assert len(mfl_df) == expected_number_of_clusters
    # Check that query has correctly counted the number of clusters per subscriber
    counted = mfl_df.groupby(
        ["subscriber", "label", "n_clusters"], as_index=False
    ).count()
    assert (counted.n_clusters == counted.cluster).all()
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    Test that the first ranked call day of each subscriber is in the first cluster
    of each subscriber.
    """
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-04", level="versioned-site")
    )
    cd_df = get_dataframe(cd)
    har_df = HartiganCluster(calldays=cd, radius=50).to_geopandas()
    key_columns = ["subscriber", "site_id", "version"]
    first_call_day = cd_df[key_columns].groupby("subscriber").first()
    first_cluster = har_df[key_columns].groupby("subscriber").first()
    joined = first_call_day.join(first_cluster, lsuffix="_cd", rsuffix="_har")
    # Membership is tested with `in`: the cluster side holds the collection of
    # sites/versions that make up the cluster.
    in_first_cluster = joined.apply(
        lambda row: (row.site_id_cd in row.site_id_har)
        and (row.version_cd in row.version_har),
        axis=1,
    )
    assert all(in_first_cluster)
def test_calldays_column_names(exemplar_level_param):
    """Test that CallDays column_names property is correct"""
    cd = CallDays(
        subscriber_locations("2016-01-01", "2016-01-03", **exemplar_level_param)
    )
    assert cd.column_names == cd.head(0).columns.tolist()