def test_invalid_reference_raises_error():
    """
    Test that passing an invalid reference location raises an error.
    """
    bad_reference = "NOT_A_LOCATION"
    with pytest.raises(
        ValueError,
        match="Argument 'reference_location' should be an instance of BaseLocation class or a tuple of two floats. Got: str",
    ):
        DistanceSeries(
            subscriber_locations=SubscriberLocations(
                "2016-01-01",
                "2016-01-07",
                spatial_unit=make_spatial_unit("lon-lat"),
            ),
            reference_location=bad_reference,
        )
def test_active_at_reference_location(get_dataframe):
    """
    Values test for active at reference location.
    """
    unique_locs = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    activity = ActiveAtReferenceLocation(
        subscriber_locations=unique_locs,
        reference_locations=daily_location("2016-01-03"),
    )
    result = get_dataframe(activity).set_index("subscriber")
    # One subscriber known not to be at their reference location, one known to be there.
    assert not result.loc["038OVABN11Ak4W5P"][0]
    assert result.loc["09NrjaNNvDanD8pk"][0]
def test_call_days_returns_expected_counts_per_subscriber(get_dataframe):
    """
    Test that the returned data is correct for a given subscriber.
    """
    test_values = (
        ("Z89mWDgZrr3qpnlB", "2016-01-01", "2016-01-03", 9),
        ("Z89mWDgZrr3qpnlB", "2016-01-01", "2016-01-08", 30),
        ("038OVABN11Ak4W5P", "2016-01-01", "2016-01-03", 6),
        ("038OVABN11Ak4W5P", "2016-01-01", "2016-01-08", 32),
    )
    for subscriber, start, end, expected_calls in test_values:
        call_days = CallDays(
            SubscriberLocations(
                start, end, spatial_unit=make_spatial_unit("versioned-site")
            )
        )
        subscriber_rows = get_dataframe(call_days).query(
            f'subscriber == "{subscriber}"'
        )
        assert subscriber_rows.calldays.sum() == expected_calls
def test_unmoving_at_reference_location_column_names(get_column_names_from_run):
    """
    Test that UnmovingAtReferenceLocation returns subscriber and value columns.
    """
    query = UnmovingAtReferenceLocation(
        locations=UniqueLocations(
            SubscriberLocations(
                "2016-01-01",
                "2016-01-01 10:00",
                spatial_unit=make_spatial_unit("admin", level=3),
            )
        ),
        reference_locations=LastLocation("2016-01-01", "2016-01-02"),
    )
    assert get_column_names_from_run(query) == ["subscriber", "value"]
def test_spatial_unit_mismatch_error():
    """
    Test that mismatched spatial units between locations and reference raise an error.
    """
    # Locations at admin level 2, reference at admin level 3 — units differ.
    locations = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-01 10:00",
            spatial_unit=make_spatial_unit("admin", level=2),
        )
    )
    reference = LastLocation(
        "2016-01-01",
        "2016-01-02",
        spatial_unit=make_spatial_unit("admin", level=3),
    )
    with pytest.raises(ValueError, match="Spatial unit mismatch"):
        UnmovingAtReferenceLocation(
            locations=locations, reference_locations=reference
        )
def test_active_at_reference_location_counts(get_dataframe):
    """
    Values test for active at reference location counts.
    """
    active = ActiveAtReferenceLocation(
        subscriber_locations=UniqueLocations(
            SubscriberLocations(
                "2016-01-01",
                "2016-01-02",
                spatial_unit=make_spatial_unit("admin", level=3),
            )
        ),
        reference_locations=daily_location("2016-01-03"),
    )
    counts = get_dataframe(ActiveAtReferenceLocationCounts(active)).set_index("pcod")
    assert counts.loc["524 1 01 04"][0] == 1
def test_bad_window(size, match):
    """
    Test some median unfriendly window sizes raise errors.
    """
    with pytest.raises(ValueError, match=match):
        locations = SubscriberLocations(
            "2016-01-01",
            "2016-01-07",
            spatial_unit=make_spatial_unit("lon-lat"),
            hours=(20, 0),
        )
        distances = DistanceSeries(subscriber_locations=locations, statistic="min")
        # The filter itself is what should reject the window size.
        IterativeMedianFilter(
            query_to_filter=ImputedDistanceSeries(distance_series=distances),
            filter_window_size=size,
        )
def test_unique_locations(get_dataframe):
    """
    Values test for unique locations.
    """
    unique_locations = UniqueLocations(
        SubscriberLocations(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        )
    )
    result = get_dataframe(unique_locations).set_index("subscriber")
    expected_pcods = [
        "524 2 04 20",
        "524 3 08 43",
        "524 4 12 62",
        "524 4 12 65",
    ]
    assert result.loc["038OVABN11Ak4W5P"].pcod.tolist() == expected_pcods
def test_column_must_exist(column_arg):
    """
    Check errors for required columns.
    """
    with pytest.raises(ValueError, match=column_arg):
        locations = SubscriberLocations(
            "2016-01-01",
            "2016-01-07",
            spatial_unit=make_spatial_unit("lon-lat"),
            hours=(20, 0),
        )
        distances = DistanceSeries(subscriber_locations=locations, statistic="min")
        # Pass a bogus value for the parametrised column argument.
        IterativeMedianFilter(
            query_to_filter=ImputedDistanceSeries(distance_series=distances),
            filter_window_size=3,
            **{column_arg: "NOT_A_VALID_COLUMN"},
        )
def _unsampled_query_obj(self):
    """
    Return the underlying flowmachine unique locations object.

    Returns
    -------
    Query
    """
    locations = SubscriberLocations(
        self.start_date,
        self.end_date,
        spatial_unit=self.aggregation_unit,
        table=self.event_types,
        subscriber_subset=self.subscriber_subset,
    )
    return UniqueLocations(locations)
def _make_meaningful_locations_object(
    *,
    start_date,
    end_date,
    label,
    labels,
    event_types,
    subscriber_subset,
    tower_cluster_call_threshold,
    tower_cluster_radius,
    tower_day_of_week_scores,
    tower_hour_of_day_scores,
):
    """
    Build a MeaningfulLocations query object from the exposed API parameters.

    Returns
    -------
    MeaningfulLocations
    """
    subscriber_locations = SubscriberLocations(
        start=start_date,
        stop=end_date,
        # note this 'spatial_unit' is not the same as the exposed parameter 'aggregation_unit'
        spatial_unit=make_spatial_unit("versioned-site"),
        table=event_types,
        subscriber_subset=subscriber_subset,
    )
    clusters = HartiganCluster(
        calldays=CallDays(subscriber_locations=subscriber_locations),
        radius=tower_cluster_radius,
        call_threshold=tower_cluster_call_threshold,
        # we're not exposing 'buffer', apparently, so we're hard-coding it
        buffer=0,
    )
    scores = EventScore(
        start=start_date,
        stop=end_date,
        score_hour=tower_hour_of_day_scores,
        score_dow=tower_day_of_week_scores,
        # note this 'spatial_unit' is not the same as the exposed parameter 'aggregation_unit'
        spatial_unit=make_spatial_unit("versioned-site"),
        table=event_types,
        subscriber_subset=subscriber_subset,
    )
    return MeaningfulLocations(
        clusters=clusters, labels=labels, scores=scores, label=label
    )
def test_unmoving_counts_column_names(get_column_names_from_run):
    """
    Test that UnmovingCounts returns pcod and value columns.
    """
    query = UnmovingCounts(
        Unmoving(
            locations=UniqueLocations(
                SubscriberLocations(
                    "2016-01-01",
                    "2016-01-01 10:00",
                    spatial_unit=make_spatial_unit("admin", level=3),
                )
            )
        )
    )
    assert get_column_names_from_run(query) == ["pcod", "value"]
def test_buffered_hartigan():
    """
    Test whether Hartigan produces buffered clusters when buffer is larger than 0.
    """
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters = HartiganCluster(calldays=call_days, radius=50, buffer=2).to_geopandas()
    areas = clusters.geometry.area
    # since the mock data does not have geom_area in the site table we either
    # get the clusters with area equivalent to 2km2 (areas below are in degrees) or None.
    assert areas.min() == pytest.approx(0.0011327683603873115)
    assert areas.max() == pytest.approx(0.001166624454009738)
def test_call_days_returns_expected_counts_per_subscriber_tower(get_dataframe):
    """
    Test that the returned data is correct for a given subscriber-location pair.
    """
    test_values = (
        ("Z89mWDgZrr3qpnlB", "m9jL23", "2016-01-01", "2016-01-03", 2),
        ("Z89mWDgZrr3qpnlB", "qvkp6J", "2016-01-01", "2016-01-08", 4),
        ("038OVABN11Ak4W5P", "QeBRM8", "2016-01-01", "2016-01-03", 1),
        ("038OVABN11Ak4W5P", "nWM8R3", "2016-01-01", "2016-01-08", 5),
    )
    for subscriber, site, start, end, expected_calls in test_values:
        call_days = CallDays(
            SubscriberLocations(
                start, end, spatial_unit=make_spatial_unit("versioned-site")
            )
        )
        rows = get_dataframe(call_days).query(
            f'subscriber == "{subscriber}" & site_id == "{site}"'
        )
        assert rows.value.values[0] == expected_calls
def test_bigger_radius_yields_fewer_clusters(get_dataframe):
    """
    Test whether bigger radius yields fewer clusters per subscriber
    """
    radii = [1, 2, 5, 10, 50]
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    clusters = get_dataframe(HartiganCluster(calldays=call_days, radius=radii[0]))
    previous_counts = clusters.groupby("subscriber").size()
    for radius in radii[1:]:
        clusters = get_dataframe(HartiganCluster(calldays=call_days, radius=radius))
        current_counts = clusters.groupby("subscriber").size()
        # Growing the radius must never increase the number of clusters.
        assert all(previous_counts >= current_counts)
        previous_counts = current_counts
def test_returns_expected_values_fixed_point(stat, sub_a_expected, sub_b_expected, get_dataframe):
    """
    Test that we get expected return values for the various statistics with 0, 0 reference
    """
    sub_a_id, sub_b_id = "j6QYNbMJgAwlVORP", "NG1km5NzBg5JD8nj"
    series = DistanceSeries(
        subscriber_locations=SubscriberLocations(
            "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lon-lat")
        ),
        statistic=stat,
    )
    df = get_dataframe(series).set_index(["subscriber", "datetime"])
    assert df.loc[(sub_a_id, date(2016, 1, 1))].value == pytest.approx(sub_a_expected)
    assert df.loc[(sub_b_id, date(2016, 1, 6))].value == pytest.approx(sub_b_expected)
def test_unmoving_at_reference_location_counts_values(get_dataframe):
    """
    Values test for unmoving at reference location counts.
    """
    counts = UnmovingAtReferenceLocationCounts(
        UnmovingAtReferenceLocation(
            locations=UniqueLocations(
                SubscriberLocations(
                    "2016-01-01",
                    "2016-01-01 10:00",
                    spatial_unit=make_spatial_unit("admin", level=1),
                )
            ),
            reference_locations=LastLocation(
                "2016-01-01",
                "2016-01-02",
                spatial_unit=make_spatial_unit("admin", level=1),
            ),
        )
    )
    result = get_dataframe(counts).set_index("pcod")
    assert result.loc["524 1"].value == 2
    assert result.loc["524 4"].value == 26
def test_redacted_active_at_reference_location_counts(get_dataframe):
    """
    Values test for redacted active at reference location counts.
    """
    counts = ActiveAtReferenceLocationCounts(
        ActiveAtReferenceLocation(
            subscriber_locations=UniqueLocations(
                SubscriberLocations(
                    "2016-01-01",
                    "2016-01-02",
                    spatial_unit=make_spatial_unit("admin", level=3),
                )
            ),
            reference_locations=daily_location("2016-01-03"),
        )
    )
    redacted = RedactedActiveAtReferenceLocationCounts(
        active_at_reference_location_counts=counts
    )
    result = get_dataframe(redacted).set_index("pcod")
    # Redaction should leave only regions with counts above the threshold.
    assert all(result.value > 15)
    assert len(result) == 2
    assert result.loc["524 3 08 44"].value == 25
def test_unmoving_at_reference_location_counts_column_names(
    get_column_names_from_run,
):
    """
    Test that the redacted unmoving at reference location counts query
    returns pcod and value columns.
    """
    query = RedactedUnmovingAtReferenceLocationCounts(
        unmoving_at_reference_location_counts=UnmovingAtReferenceLocationCounts(
            UnmovingAtReferenceLocation(
                locations=UniqueLocations(
                    SubscriberLocations(
                        "2016-01-01",
                        "2016-01-01 10:00",
                        spatial_unit=make_spatial_unit("admin", level=1),
                    )
                ),
                reference_locations=LastLocation(
                    "2016-01-01",
                    "2016-01-02",
                    spatial_unit=make_spatial_unit("admin", level=1),
                ),
            )
        )
    )
    assert get_column_names_from_run(query) == ["pcod", "value"]
def test_impute(get_dataframe):
    """
    Test that the SQL imputation matches a reference imputation done in pandas
    via fill_in_dates, for every subscriber with enough distinct days.
    """
    sl = SubscriberLocations(
        "2016-01-01",
        "2016-01-07",
        spatial_unit=make_spatial_unit("lon-lat"),
        hours=(20, 0),
    )
    ds = DistanceSeries(subscriber_locations=sl, statistic="min")
    ds_df = get_dataframe(ds)
    sql = get_dataframe(ImputedDistanceSeries(distance_series=ds))
    all_subs = ds_df.subscriber.drop_duplicates()
    for sub in all_subs:
        # Hoist the per-subscriber frame instead of filtering twice;
        # the leftover debug print has been removed.
        sub_df = ds_df[ds_df.subscriber == sub]
        # Imputation is only defined for subscribers seen on more than three days.
        if sub_df.datetime.nunique() > 3:
            to_be_imputed = sub_df.sort_values("datetime")
            imputed = fill_in_dates(to_be_imputed, 3, sl.start, sl.stop)
            assert imputed.value.values.tolist() == pytest.approx(
                sql[sql.subscriber == sub].value.tolist()
            )
def test_unique_visitor_counts_column_names(get_column_names_from_run):
    """
    Test that UniqueVisitorCounts returns pcod and value columns.
    """
    query = UniqueVisitorCounts(
        ActiveAtReferenceLocationCounts(
            ActiveAtReferenceLocation(
                subscriber_locations=UniqueLocations(
                    SubscriberLocations(
                        "2016-01-01",
                        "2016-01-02",
                        spatial_unit=make_spatial_unit("admin", level=3),
                    )
                ),
                reference_locations=daily_location("2016-01-03"),
            )
        ),
        UniqueSubscriberCounts(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        ),
    )
    assert get_column_names_from_run(query) == ["pcod", "value"]
def test_error_on_spatial_unit_mismatch():
    """
    Test that error is raised if the spatial unit of the subscriber locations isn't point.
    """
    reference = daily_location(
        "2016-01-01", spatial_unit=make_spatial_unit("admin", level=3)
    )
    with pytest.raises(
        ValueError,
        match="reference_location must have the same spatial unit as subscriber_locations.",
    ):
        DistanceSeries(
            subscriber_locations=SubscriberLocations(
                "2016-01-01", "2016-01-07", spatial_unit=make_spatial_unit("lon-lat")
            ),
            reference_location=reference,
        )
def test_column_names_meaningful_locations(get_column_names_from_run):
    """ Test that column_names property matches head(0) for meaningfullocations"""
    clusters = HartiganCluster(
        calldays=CallDays(
            subscriber_locations=SubscriberLocations(
                start="2016-01-01",
                stop="2016-01-02",
                spatial_unit=make_spatial_unit("versioned-site"),
            )
        ),
        radius=1,
    )
    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-02",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    mfl = MeaningfulLocations(
        clusters=clusters, scores=scores, labels=labels, label="evening"
    )
    assert get_column_names_from_run(mfl) == mfl.column_names
def test_join_with_polygon(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can get the (arbitrary) polygon of each cell.
    """
    locations = SubscriberLocations(
        "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
    )
    joined = JoinToLocation(
        locations,
        spatial_unit=make_spatial_unit(
            "polygon",
            region_id_column_name="admin3pcod",
            geom_table="geography.admin3",
            geom_column="geom",
        ),
    )
    result = get_dataframe(joined)
    assert sorted(result.columns) == sorted(
        ["admin3pcod", "location_id", "subscriber", "time"]
    )
    # The join should not drop any rows.
    assert len(result) == get_length(locations)
def test_join_with_versioned_cells(get_dataframe, get_length):
    """
    Test that flowmachine.JoinToLocation can fetch the cell version.
    """
    locations = SubscriberLocations(
        "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
    )
    result = get_dataframe(
        JoinToLocation(locations, spatial_unit=make_spatial_unit("versioned-cell"))
    )
    # As our database is complete we should not drop any rows
    assert len(result) == get_length(locations)
    # These should all be version zero, these are the towers before the changeover date, or those that
    # have not moved.
    should_be_version_zero = result[
        (result.time <= move_date) | (~result.location_id.isin(moving_sites))
    ]
    # These should all be one, they are the ones after the change over time that have moved.
    should_be_version_one = result[
        (result.time > move_date) & (result.location_id.isin(moving_sites))
    ]
    assert (should_be_version_zero.version == 0).all()
    assert (should_be_version_one.version == 1).all()
def test_unique_visitor_counts(get_dataframe):
    """
    Values test for unique visitor counts.
    """
    visitors = UniqueVisitorCounts(
        ActiveAtReferenceLocationCounts(
            ActiveAtReferenceLocation(
                subscriber_locations=UniqueLocations(
                    SubscriberLocations(
                        "2016-01-01",
                        "2016-01-02",
                        spatial_unit=make_spatial_unit("admin", level=3),
                    )
                ),
                reference_locations=daily_location("2016-01-03"),
            )
        ),
        UniqueSubscriberCounts(
            "2016-01-01",
            "2016-01-02",
            spatial_unit=make_spatial_unit("admin", level=3),
        ),
    )
    result = get_dataframe(visitors).set_index("pcod")
    assert result.loc["524 1 01 04"].value == 66
    assert result.loc["524 3 08 44"].value == 170
def test_join_with_lon_lat(get_dataframe):
    """
    Test that flowmachine.JoinToLocation can get the lon-lat values of the cell
    """
    ul = SubscriberLocations(
        "2016-01-05", "2016-01-07", spatial_unit=make_spatial_unit("cell")
    )
    df = get_dataframe(JoinToLocation(ul, spatial_unit=make_spatial_unit("lon-lat")))
    expected_cols = sorted(["subscriber", "time", "location_id", "lon", "lat"])
    assert sorted(df.columns) == expected_cols
    # Pick out one cell that moves location and assert that the
    # lon-lats are right
    focal_cell = "dJb0Wd"
    lon1, lat1 = (83.09284486, 27.648837800000003)
    lon2, lat2 = (83.25769074752517, 27.661443318109132)
    post_move = df[(df.time > move_date) & (df["location_id"] == focal_cell)]
    pre_move = df[(df.time < move_date) & (df["location_id"] == focal_cell)]
    # And check them all one-by-one. These calls previously lacked the
    # assert keyword, so the value comparisons were silent no-ops.
    assert np.isclose(pre_move.lon, lon1).all()
    assert np.isclose(pre_move.lat, lat1).all()
    assert np.isclose(post_move.lon, lon2).all()
    assert np.isclose(post_move.lat, lat2).all()
def test_different_call_days_format(get_dataframe):
    """
    Test whether we can pass different call days format such as table name, SQL query and CallDays class.
    """
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    # Directly from a CallDays query object.
    from_query = get_dataframe(HartiganCluster(calldays=call_days, radius=50))
    assert isinstance(from_query, pd.DataFrame)
    # From a stored database table.
    call_days.store().result()
    from_table = get_dataframe(
        HartiganCluster(calldays=Table(call_days.fully_qualified_table_name), radius=50)
    )
    assert isinstance(from_table, pd.DataFrame)
    # From raw SQL wrapped in a CustomQuery.
    from_sql = get_dataframe(
        HartiganCluster(
            calldays=CustomQuery(call_days.get_query(), call_days.column_names),
            radius=50,
        )
    )
    assert isinstance(from_sql, pd.DataFrame)
def test_meaningful_locations_aggregate_disallowed_spatial_unit_raises():
    """ Test that a bad spatial unit raises an InvalidSpatialUnitError"""
    with pytest.raises(InvalidSpatialUnitError):
        meaningful = MeaningfulLocations(
            clusters=HartiganCluster(
                calldays=CallDays(
                    subscriber_locations=SubscriberLocations(
                        start="2016-01-01",
                        stop="2016-01-02",
                        spatial_unit=make_spatial_unit("versioned-site"),
                    )
                ),
                radius=1,
            ),
            scores=EventScore(
                start="2016-01-01",
                stop="2016-01-02",
                spatial_unit=make_spatial_unit("versioned-site"),
            ),
            labels=labels,
            label="evening",
        )
        # lon-lat is not a valid aggregation unit for this query.
        MeaningfulLocationsAggregate(
            meaningful_locations=meaningful,
            spatial_unit=make_spatial_unit("lon-lat"),
        )
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    Test that the first ranked call day of each subscriber is in the first cluster of each subscriber.
    """
    call_days = CallDays(
        SubscriberLocations(
            "2016-01-01", "2016-01-04", spatial_unit=make_spatial_unit("versioned-site")
        )
    )
    call_days_df = get_dataframe(call_days)
    clusters_df = HartiganCluster(calldays=call_days, radius=50).to_geopandas()
    first_call_day = (
        call_days_df[["subscriber", "site_id", "version"]].groupby("subscriber").first()
    )
    first_cluster = (
        clusters_df[["subscriber", "site_id", "version"]].groupby("subscriber").first()
    )
    joined = first_call_day.join(first_cluster, lsuffix="_cd", rsuffix="_har")
    in_first_cluster = joined.apply(
        lambda row: (row.site_id_cd in row.site_id_har)
        and (row.version_cd in row.version_har),
        axis=1,
    )
    assert all(in_first_cluster)