def test_dl_count_sum_equal_or_less_than_period(get_dataframe):
    """
    The summed LocationVisits count of a subscriber must never exceed the
    number of days between 'start_date' and 'stop_date'.
    """
    # (number of days in the period, first day, last day)
    periods = (
        (7, "2016-01-01", "2016-01-07"),
        (3, "2016-01-01", "2016-01-03"),
    )
    for days, start_date, stop_date in periods:
        daily_locations = [
            daily_location(day, level="admin3", method="last")
            for day in list_of_dates(start_date, stop_date)
        ]
        visits = LocationVisits(DayTrajectories(*daily_locations))
        df = get_dataframe(visits)
        # Restrict to the rows whose subscriber equals the first cell of the frame
        reference_subscriber = df.iloc[0, 0]
        matching_rows = df[df["subscriber"] == reference_subscriber]
        assert matching_rows["dl_count"].sum() <= days
def __init__(self, start, stop, method="home-location", level="versioned-site", **kwargs):
    """
    Build the sub-queries needed by the PWO model.

    Parameters
    ----------
    start, stop
        Bounds handed to `list_of_dates` to enumerate the days whose daily
        locations are combined; `stop` is also used as the `date` of the
        distance matrix.
    method : str, default "home-location"
        How the population is derived. Only "home-location" is implemented.
    level : str, default "versioned-site"
        Passed to the distance matrix, the daily locations and the
        population buffer.
    **kwargs
        Forwarded unchanged to both `DistanceMatrix` and `daily_location`.

    Raises
    ------
    ValueError
        If `method` is not "home-location".
    """
    warnings.warn(
        "The PWO model is currently **experimental**. "
        + "Please review Yan X-Y et al. "
        + "(http://dx.doi.org/10.1098/rsif.2014.0834) "
        + "before using this model in production."
    )
    # Fail fast with a clear message: previously an unsupported method left
    # `self.population_object` unset and surfaced later as an AttributeError.
    if method != "home-location":
        raise ValueError(
            f"Unsupported method '{method}'; only 'home-location' is implemented."
        )

    self.start = start
    self.stop = stop
    self.method = method
    self.level = level
    self.distance_matrix = DistanceMatrix(
        date=self.stop, level=level, return_geometry=True, **kwargs
    )
    self.population_object = ModalLocation(
        *[
            daily_location(d, level=self.level, **kwargs)
            for d in list_of_dates(self.start, self.stop)
        ]
    ).aggregate()
    self.population_buffer_object = _populationBuffer(
        level=self.level,
        population_object=self.population_object,
        distance_matrix=self.distance_matrix,
    )
def setUp(self):
    """
    Create the three-day HomeLocation query used by the tests, together with
    its dataframe indexed by subscriber.
    """
    self.hl = HomeLocation(
        *[daily_location(d) for d in list_of_dates("2016-01-01", "2016-01-03")]
    )
    # Fetch the dataframe once; the earlier un-indexed assignment was
    # immediately overwritten and only added a redundant call.
    self.hdf = self.hl.get_dataframe().set_index("subscriber")
def _check_dates(self):
    """
    Compare the requested calendar dates with those available in the database.

    Raises MissingDateError when the table has no recorded dates at all or
    when none of the requested dates is available, and emits a warning when
    only some of the requested dates are available.
    """
    # Without an explicit start/stop, fall back to the earliest/latest date
    # of the table; otherwise keep only the date part of the given string.
    if self.start is not None:
        first_day = self.start.split()[0]
    else:
        first_day = (
            get_db()
            .min_date(self.table_ORIG.fully_qualified_table_name.split(".")[1])
            .strftime("%Y-%m-%d")
        )
    if self.stop is not None:
        last_day = self.stop.split()[0]
    else:
        last_day = (
            get_db()
            .max_date(self.table_ORIG.fully_qualified_table_name.split(".")[1])
            .strftime("%Y-%m-%d")
        )

    all_dates = list_of_dates(first_day, last_day)

    # A stop such as '2016-01-02' is interpreted as midnight, so that final
    # day contributes nothing and is dropped from the dates to check.
    if self.stop is not None:
        stops_at_midnight = len(self.stop) == 10 or self.stop.endswith("00:00:00")
        if stops_at_midnight:
            all_dates.pop()

    try:
        available = {
            day.strftime("%Y-%m-%d")
            for day in get_db().available_dates[self.table_ORIG.name]
        }
    except KeyError:
        # No dates at all for this table
        raise MissingDateError

    present_dates = [day for day in all_dates if day in available]
    n_requested = len(all_dates)
    n_present = len(present_dates)
    logger.debug(f"Data for {n_present}/{n_requested} calendar dates.")

    # All dates are missing
    if n_present == 0:
        raise MissingDateError
    # Some dates are missing, others are present
    if n_present < n_requested:
        warnings.warn(
            f"{n_requested - n_present} of {n_requested} calendar dates missing. Earliest date is {present_dates[0]}, latest is {present_dates[-1]}.",
            stacklevel=2,
        )
def __init__(
    self,
    start: str,
    stop: str,
    *,
    spatial_unit: Optional[LonLatSpatialUnit] = None,
    departure_rate: Union[pd.DataFrame, float] = 0.1,
    hours: Union[str, Tuple[int, int]] = "all",
    method: str = "last",
    table: Union[str, List[str]] = "all",
    subscriber_identifier: str = "msisdn",
    subscriber_subset: Optional[Query] = None,
):
    """
    Build the sub-queries needed by the PopulationWeightedOpportunities model.

    Parameters
    ----------
    start, stop : str
        Bounds handed to `list_of_dates` to enumerate the days whose daily
        locations are combined into the modal location.
    spatial_unit : LonLatSpatialUnit, optional
        Spatial unit for the distance matrix and daily locations. When
        omitted, `make_spatial_unit("versioned-site")` is used.
    departure_rate : pandas.DataFrame or float, default 0.1
        Either a single rate, or a dataframe of rates. For a dataframe,
        every column except "rate" gets a "_from" suffix, columns are put in
        sorted order and rows are sorted, so the stored frame does not
        depend on the ordering of the input.
    hours, method, table, subscriber_identifier, subscriber_subset
        Forwarded to `daily_location`.

    Raises
    ------
    TypeError
        If `departure_rate` is neither a dataframe nor a float.
    """
    warnings.warn(
        "The PopulationWeightedOpportunities model is currently **experimental**. "
        + "Please review Yan X-Y et al. "
        + "(http://dx.doi.org/10.1098/rsif.2014.0834) "
        + "before using this model in production."
    )

    if isinstance(departure_rate, pd.DataFrame):
        # Rename the columns to match what we'll join to
        renamed = departure_rate.rename(
            columns=lambda x: x if x == "rate" else f"{x}_from"
        )
        ordered_columns = sorted(renamed.columns)
        # Sort the dataframe so we'll have a consistent md5. Whole rows are
        # sorted together: sorting each column on its own (as was done
        # before) detaches every rate from the location it belongs to.
        self.departure_rate = (
            renamed.reindex(columns=ordered_columns)
            .sort_values(by=ordered_columns)
            .reset_index(drop=True)
        )
    elif isinstance(departure_rate, float):
        self.departure_rate = departure_rate
    else:
        raise TypeError(f"{departure_rate} must be a float or dataframe")

    self.start = start
    self.stop = stop
    if spatial_unit is None:
        self.spatial_unit = make_spatial_unit("versioned-site")
    else:
        self.spatial_unit = spatial_unit
    self.distance_matrix = DistanceMatrix(
        spatial_unit=self.spatial_unit, return_geometry=True
    )

    self.population_object = ModalLocation(
        *[
            daily_location(
                d,
                spatial_unit=self.spatial_unit,
                hours=hours,
                method=method,
                table=table,
                subscriber_identifier=subscriber_identifier,
                ignore_nulls=True,
                subscriber_subset=subscriber_subset,
            )
            for d in list_of_dates(self.start, self.stop)
        ]
    ).aggregate()

    self.population_buffer_object = _PopulationBuffer(
        population_object=self.population_object,
        distance_matrix=self.distance_matrix,
    )
def test_subset_correct(self):
    """Test that pushed in subsetting matches .subset result"""

    def same_rows(left, right):
        # `all(left == right)` only iterates the column labels of the
        # comparison frame, which are always truthy, so it could never fail.
        # Compare the actual cell values instead.
        # NOTE(review): rows are sorted first on the assumption that the two
        # queries do not guarantee a row order - confirm.
        columns = list(left.columns)
        left_sorted = left.sort_values(columns).reset_index(drop=True)
        right_sorted = right.sort_values(columns).reset_index(drop=True)
        return left_sorted.equals(right_sorted)

    su = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=self.subscriber_list
    )
    subsu = EventTableSubset("2016-01-01", "2016-01-03").subset(
        "subscriber", self.subscriber_list
    )
    self.assertTrue(same_rows(su.get_dataframe(), subsu.get_dataframe()))

    # Both sides must cover the same period to be comparable; the pushed-in
    # variant previously ran to 2016-01-07 while the reference stopped at
    # 2016-01-03.
    su = HomeLocation(
        *[
            daily_location(d, subscriber_subset=self.subscriber_list)
            for d in list_of_dates("2016-01-01", "2016-01-03")
        ]
    )
    subsu = HomeLocation(
        *[daily_location(d) for d in list_of_dates("2016-01-01", "2016-01-03")]
    ).subset("subscriber", self.subscriber_list)
    self.assertTrue(same_rows(su.get_dataframe(), subsu.get_dataframe()))
def test_subset_correct(subscriber_list, get_dataframe):
    """Test that pushed in subsetting matches .subset result"""

    def same_rows(left, right):
        # `all(left == right)` only iterates the column labels of the
        # comparison frame, which are always truthy, so it could never fail.
        # Compare the actual cell values instead.
        # NOTE(review): rows are sorted first on the assumption that the two
        # queries do not guarantee a row order - confirm.
        columns = list(left.columns)
        left_sorted = left.sort_values(columns).reset_index(drop=True)
        right_sorted = right.sort_values(columns).reset_index(drop=True)
        return left_sorted.equals(right_sorted)

    su = EventTableSubset(
        start="2016-01-01", stop="2016-01-03", subscriber_subset=subscriber_list
    )
    subsu = EventTableSubset(start="2016-01-01", stop="2016-01-03").subset(
        "subscriber", subscriber_list
    )
    assert same_rows(get_dataframe(su), get_dataframe(subsu))

    # Both sides must cover the same period to be comparable; the pushed-in
    # variant previously ran to 2016-01-07 while the reference stopped at
    # 2016-01-03.
    su = ModalLocation(
        *[
            daily_location(d, subscriber_subset=subscriber_list)
            for d in list_of_dates("2016-01-01", "2016-01-03")
        ]
    )
    subsu = ModalLocation(
        *[daily_location(d) for d in list_of_dates("2016-01-01", "2016-01-03")]
    ).subset("subscriber", subscriber_list)
    assert same_rows(get_dataframe(su), get_dataframe(subsu))
def test_error_when_home_location_not_latlong():
    """
    Displacement must raise a ValueError when the home location handed to it
    is not using level lat-lon.
    """
    daily_locations = [
        daily_location(day) for day in list_of_dates("2016-01-01", "2016-01-02")
    ]
    home_locations = HomeLocation(*daily_locations)

    with pytest.raises(ValueError):
        Displacement(
            "2016-01-01",
            "2016-01-02",
            home_locations=home_locations,
            statistic="avg",
        )
def test_can_be_aggregated_lon_lat(get_dataframe):
    """
    Aggregating a modal location built on lon-lat data yields the
    lon, lat and value columns.
    """
    daily_locations = [
        daily_location(
            day, spatial_unit=make_spatial_unit("lon-lat"), method="last"
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    aggregated = ModalLocation(*daily_locations).aggregate()
    columns = list(get_dataframe(aggregated).columns)
    assert columns == ["lon", "lat", "value"]
def test_can_be_aggregated_latlong(self):
    """
    Query can be aggregated to a spatial level with lat-lon data.
    """
    hl = HomeLocation(
        *[
            daily_location(d, level="lat-lon", method="last")
            for d in list_of_dates("2016-01-01", "2016-01-03")
        ]
    )
    agg = hl.aggregate()
    df = agg.get_dataframe()
    # isinstance-based check: gives a clearer failure message than comparing
    # `type(df)` by identity and also accepts DataFrame subclasses.
    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual(list(df.columns), ["lat", "lon", "total"])
def test_error_when_modal_location_not_lon_lat():
    """
    Displacement must raise a ValueError when the modal location handed to
    it is not using the lon-lat spatial unit.
    """
    daily_locations = [
        daily_location(day) for day in list_of_dates("2016-01-01", "2016-01-02")
    ]
    modal_locations = ModalLocation(*daily_locations)

    with pytest.raises(ValueError):
        Displacement(
            "2016-01-01",
            "2016-01-02",
            modal_locations=modal_locations,
            statistic="avg",
        )
def test_contact_reference_location_false_level_raises():
    """
    ContactReferenceLocationStats must raise a ValueError when the contact
    locations use a non-compliant level.
    """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    daily_locations = [
        daily_location(
            day,
            level="admin3",
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    ml = ModalLocation(*daily_locations)

    with pytest.raises(ValueError):
        ContactReferenceLocationStats(cb, ml)
def test_contact_reference_location_stats_false_statistic_raises():
    """
    ContactReferenceLocationStats must raise a ValueError when given a
    non-compliant statistic parameter.
    """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    daily_locations = [
        daily_location(
            day,
            spatial_unit=make_spatial_unit("versioned-cell"),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    ml = ModalLocation(*daily_locations)

    with pytest.raises(ValueError):
        ContactReferenceLocationStats(cb, ml, statistic="error")
def __init__(self, start, stop, modal_locations=None, statistic="avg", unit="km", **kwargs):
    """
    Join modal (home) locations with subscriber locations for the period.

    Parameters
    ----------
    start, stop
        Bounds of the period. Subscriber locations use `stop` as given; the
        default modal location stops one day earlier.
    modal_locations : ModalLocation, optional
        Pre-built modal location whose `level` must be one of "lat-lon",
        "versioned-cell" or "versioned-site". When omitted, one is built
        from lat-lon daily locations.
    statistic : str, default "avg"
        Lower-cased and checked against `valid_stats`.
    unit : str, default "km"
        Stored on `self.unit`.
    **kwargs
        Forwarded to `daily_location` and `subscriber_locations`.

    Raises
    ------
    ValueError
        If `statistic` is not in `valid_stats`, or `modal_locations` is
        given but is not a ModalLocation with an allowed level.
    """
    # Validate the cheap argument before any sub-query is constructed.
    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        raise ValueError(
            "{} is not a valid statistic. Use one of {}".format(
                self.statistic, valid_stats
            )
        )

    # need to subtract one day from hl end in order to be
    # comparing over same period...
    self.stop_sl = stop
    self.stop_hl = str(parse_datestring(stop) - relativedelta(days=1))
    self.start = start

    allowed_levels = ["lat-lon", "versioned-cell", "versioned-site"]
    # Explicit None test: "was the argument supplied" should not depend on
    # the truthiness of the object that was passed in.
    if modal_locations is not None:
        if not (
            isinstance(modal_locations, ModalLocation)
            and modal_locations.level in allowed_levels
        ):
            raise ValueError(
                f"Argument 'modal_locations' should be an instance of ModalLocation class with level in {allowed_levels}"
            )
        hl = modal_locations
    else:
        hl = ModalLocation(
            *[
                daily_location(date, level="lat-lon", **kwargs)
                for date in list_of_dates(self.start, self.stop_hl)
            ]
        )

    sl = subscriber_locations(self.start, self.stop_sl, level="lat-lon", **kwargs)

    self.joined = hl.join(
        sl,
        on_left="subscriber",
        on_right="subscriber",
        how="left",
        left_append="_home_loc",
        right_append="",
    )
    self.unit = unit
    super().__init__()
def test_can_be_aggregated_latlong(get_dataframe):
    """
    Aggregating a modal location built on lat-lon data yields the
    lat, lon and total columns.
    """
    daily_locations = [
        daily_location(day, level="lat-lon", method="last")
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    aggregated = ModalLocation(*daily_locations).aggregate()
    columns = list(get_dataframe(aggregated).columns)
    assert columns == ["lat", "lon", "total"]
def test_selected_values(get_dataframe):
    """
    ModalLocation() values are correct.
    """
    hdf = get_dataframe(
        ModalLocation(
            *[daily_location(d) for d in list_of_dates("2016-01-01", "2016-01-03")]
        )
    ).set_index("subscriber")

    # subscriber -> expected value in the first remaining column
    expected = {
        "038OVABN11Ak4W5P": "Dolpa",
        "E1n7JoqxPBjvR5Ve": "Baglung",
        "gkBLe0mN5j3qmRpX": "Myagdi",
        "5Kgwy8Gp6DlN3Eq9": "Kapilbastu",
    }
    for subscriber, location in expected.items():
        # `.ix` has been removed from pandas: select the row by label with
        # `.loc` and the first column by position with `.iloc`.
        assert location == hdf.loc[subscriber].iloc[0]
def test_contact_reference_location_bad_spatial_unit_raises():
    """
    ContactReferenceLocationStats must raise InvalidSpatialUnitError when
    the contact locations use a non-compliant spatial unit.
    """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    daily_locations = [
        daily_location(
            day,
            spatial_unit=make_spatial_unit("admin", level=3),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    ml = ModalLocation(*daily_locations)

    with pytest.raises(InvalidSpatialUnitError):
        ContactReferenceLocationStats(cb, ml)
def test_contact_reference_location_no_spatial_unit_raises():
    """
    ContactReferenceLocationStats must raise a ValueError when the contact
    locations have no spatial_unit attribute.
    """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    daily_locations = [
        daily_location(
            day,
            spatial_unit=make_spatial_unit("versioned-cell"),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    ml = ModalLocation(*daily_locations)
    # Wrapping the modal location in a CustomQuery strips its spatial_unit
    # attribute, which is what should trigger the error.
    wrapped = CustomQuery(ml.get_query(), ml.column_names)

    with pytest.raises(ValueError):
        ContactReferenceLocationStats(cb, wrapped)
def test_contact_reference_location_stats(get_dataframe, statistic, msisdn, spatial_unit_type, want):
    """
    Check a few hand-picked ContactReferenceLocationStats values.
    """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    daily_locations = [
        daily_location(
            day,
            spatial_unit=make_spatial_unit(spatial_unit_type),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    ml = ModalLocation(*daily_locations)
    # Store both inputs before building the query that depends on them
    cb.store()
    ml.store()

    stats = ContactReferenceLocationStats(cb, ml, statistic=statistic)
    by_subscriber = get_dataframe(stats).set_index("subscriber")
    assert by_subscriber.value[msisdn] == pytest.approx(want)
def test_get_all_users_in_modal_location(get_dataframe):
    """
    Displacement values are returned for every subscriber present in the
    home location object.
    """
    home_start, home_stop = "2016-01-02 10:00:00", "2016-01-02 12:00:00"
    disp_start, disp_stop = "2016-01-01 12:01:00", "2016-01-01 15:20:00"

    daily_locations = [
        daily_location(day, level="lat-lon", hours=(12, 13))
        for day in list_of_dates(home_start, home_stop)
    ]
    hl = ModalLocation(*daily_locations)
    displacement = Displacement(disp_start, disp_stop, modal_locations=hl)

    hl_subscribers = set(get_dataframe(hl).subscriber)
    d_subscribers = set(get_dataframe(displacement).subscriber)

    # No home-location subscriber may be missing from the displacement result
    assert hl_subscribers <= d_subscribers
def test_pass_modal_location(get_dataframe):
    """
    A home location object can be passed to the class.
    """
    daily_locations = [
        daily_location(day, level="lat-lon")
        for day in list_of_dates("2016-01-01", "2016-01-06")
    ]
    hl = ModalLocation(*daily_locations)
    displacement = Displacement(
        "2016-01-01", "2016-01-07", modal_locations=hl, statistic="avg"
    )

    by_subscriber = get_dataframe(displacement).set_index("subscriber")
    observed = by_subscriber.loc["j6QYNbMJgAwlVORP"].statistic
    assert observed == pytest.approx(169.926194)
def test_subscriber_with_home_loc_but_no_calls_is_nan(get_dataframe):
    """
    A subscriber with a home location but no activity between start and
    stop gets a nan value.
    """
    home_start, home_stop = "2016-01-02 10:00:00", "2016-01-02 12:00:00"
    disp_start, disp_stop = "2016-01-01 12:01:00", "2016-01-01 15:20:00"
    subscriber = "OdM7np8LYEp1mkvP"

    daily_locations = [
        daily_location(day, level="lat-lon", hours=(12, 13))
        for day in list_of_dates(home_start, home_stop)
    ]
    hl = ModalLocation(*daily_locations)
    displacement = Displacement(disp_start, disp_stop, modal_locations=hl)

    by_subscriber = get_dataframe(displacement).set_index("subscriber")
    assert isnan(by_subscriber.loc[subscriber].statistic)
def test_pass_modal_location(get_dataframe):
    """
    A home location object can be passed to the class.
    """
    daily_locations = [
        daily_location(day, spatial_unit=make_spatial_unit("lon-lat"))
        for day in list_of_dates("2016-01-01", "2016-01-06")
    ]
    modal_locations = ModalLocation(*daily_locations)
    displacement = Displacement(
        "2016-01-01",
        "2016-01-07",
        modal_locations=modal_locations,
        statistic="avg",
    )

    by_subscriber = get_dataframe(displacement).set_index("subscriber")
    observed = by_subscriber.loc["j6QYNbMJgAwlVORP"].statistic
    assert observed == pytest.approx(176.903620)
def __init__(
    self,
    start,
    stop,
    method="home-location",
    spatial_unit: Optional[LonLatSpatialUnit] = None,
    **kwargs,
):
    """
    Build the sub-queries needed by the PWO model.

    Parameters
    ----------
    start, stop
        Bounds handed to `list_of_dates` to enumerate the days whose daily
        locations are combined into the modal location.
    method : str, default "home-location"
        How the population is derived. Only "home-location" is implemented.
    spatial_unit : LonLatSpatialUnit, optional
        Spatial unit for the distance matrix and daily locations. When
        omitted, `make_spatial_unit("versioned-site")` is used.
    **kwargs
        Forwarded unchanged to `daily_location`.

    Raises
    ------
    ValueError
        If `method` is not "home-location".
    """
    warnings.warn(
        "The PWO model is currently **experimental**. "
        + "Please review Yan X-Y et al. "
        + "(http://dx.doi.org/10.1098/rsif.2014.0834) "
        + "before using this model in production."
    )
    # Fail fast with a clear message: previously an unsupported method left
    # `self.population_object` unset and surfaced later as an AttributeError.
    if method != "home-location":
        raise ValueError(
            f"Unsupported method '{method}'; only 'home-location' is implemented."
        )

    self.start = start
    self.stop = stop
    self.method = method
    if spatial_unit is None:
        self.spatial_unit = make_spatial_unit("versioned-site")
    else:
        self.spatial_unit = spatial_unit
    self.distance_matrix = DistanceMatrix(
        spatial_unit=self.spatial_unit, return_geometry=True
    )
    self.population_object = ModalLocation(
        *[
            daily_location(d, spatial_unit=self.spatial_unit, **kwargs)
            for d in list_of_dates(self.start, self.stop)
        ]
    ).aggregate()
    self.population_buffer_object = _populationBuffer(
        population_object=self.population_object,
        distance_matrix=self.distance_matrix,
    )
def test_contact_reference_location_stats_custom_geometry(get_dataframe):
    """
    ContactReferenceLocationStats works with a custom geometry column.
    """
    cb = ContactBalance("2016-01-01", "2016-01-03")
    daily_locations = [
        daily_location(
            day,
            spatial_unit=make_spatial_unit("versioned-cell"),
            subscriber_subset=cb.counterparts_subset(include_subscribers=True),
        )
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    ml = ModalLocation(*daily_locations)
    cb.store()
    ml.store()

    # Expose the modal location as a point geometry in a column named "loc"
    geometry_sql = (
        f"SELECT subscriber, ST_POINT(lon, lat) AS loc FROM ({ml.get_query()}) _"
    )
    geometry_query = CustomQuery(geometry_sql, ["subscriber", "loc"])

    stats = ContactReferenceLocationStats(
        cb, geometry_query, statistic="avg", geom_column="loc"
    )
    by_subscriber = get_dataframe(stats).set_index("subscriber")
    assert by_subscriber.value["gwAynWXp4eWvxGP7"] == pytest.approx(298.7215)
def test_query_can_be_subscriber_set_restricted(subscriber_list_table, subscriber_list, get_dataframe):
    """Some queries can be limited to only a subset of subscribers."""
    rog = RadiusOfGyration(
        "2016-01-01", "2016-01-03", subscriber_subset=subscriber_list_table
    )
    daily_locations = [
        daily_location(day, subscriber_subset=subscriber_list_table)
        for day in list_of_dates("2016-01-01", "2016-01-03")
    ]
    hl = ModalLocation(*daily_locations)

    # Fetch both results before asserting on either of them
    frames = [get_dataframe(query) for query in (rog, hl)]

    # Each result must contain exactly the subscribers of the subset
    expected_subscribers = set(subscriber_list)
    for frame in frames:
        assert set(frame.subscriber) == expected_subscribers
def test_query_can_be_subscriber_set_restricted(self):
    """Test that some queries can be limited to only a subset of subscribers."""
    con = Table.connection.engine
    drop_sql = "DROP TABLE IF EXISTS subscriber_list"

    # Create a temporary table in the DB
    con.execute(drop_sql)
    try:
        con.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")
        # NOTE(review): values are interpolated straight into the SQL text;
        # presumably acceptable because they come from the test's own
        # subscriber list - confirm they can never contain quotes.
        formatted_subscribers = ",".join(
            "('{}')".format(u) for u in self.subscriber_list
        )
        con.execute(
            """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
                formatted_subscribers
            )
        )

        rog = RadiusOfGyration(
            "2016-01-01", "2016-01-03", subscriber_subset=Table("subscriber_list")
        )
        hl = HomeLocation(
            *[
                daily_location(d, subscriber_subset=Table("subscriber_list"))
                for d in list_of_dates("2016-01-01", "2016-01-03")
            ]
        )
        rog_df = rog.get_dataframe()
        hl_df = hl.get_dataframe()
    finally:
        # Always remove the temporary table, even if a query above failed,
        # so it cannot leak into other tests.
        con.execute(drop_sql)

    # Get the set of subscribers present in the dataframe, we need to handle the logic
    # of msisdn_from/msisdn_to
    calculated_subscriber_set = set(rog_df.subscriber)
    self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
    calculated_subscriber_set = set(hl_df.subscriber)
    self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
def __init__(
    self,
    start,
    stop,
    modal_locations=None,
    statistic="avg",
    unit="km",
    hours="all",
    method="last",
    table="all",
    subscriber_identifier="msisdn",
    ignore_nulls=True,
    subscriber_subset=None,
):
    """
    Join modal (home) locations with subscriber locations for the period.

    Parameters
    ----------
    start, stop
        Bounds of the period. Subscriber locations use `stop` as given; the
        default modal location stops one day earlier.
    modal_locations : ModalLocation, optional
        Pre-built modal location; its spatial unit is checked against the
        "has_lon_lat_columns" criterion. When omitted, one is built from
        lon-lat daily locations.
    statistic : str, default "avg"
        Lower-cased and checked against `valid_stats`.
    unit : str, default "km"
        Stored on `self.unit`.
    hours, table, subscriber_identifier, ignore_nulls, subscriber_subset
        Forwarded to `daily_location` and `SubscriberLocations`.
    method : str, default "last"
        Forwarded to `daily_location` only.

    Raises
    ------
    ValueError
        If `statistic` is not in `valid_stats`, or `modal_locations` is
        given but is not a ModalLocation instance.
    """
    # Validate the cheap argument before any sub-query is constructed.
    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        raise ValueError(
            "{} is not a valid statistic. Use one of {}".format(
                self.statistic, valid_stats
            )
        )

    # need to subtract one day from hl end in order to be
    # comparing over same period...
    self.stop_sl = stop
    self.stop_hl = str(parse_datestring(stop) - relativedelta(days=1))
    self.start = start

    # Explicit None test: "was the argument supplied" should not depend on
    # the truthiness of the object that was passed in.
    if modal_locations is not None:
        if not isinstance(modal_locations, ModalLocation):
            raise ValueError(
                "Argument 'modal_locations' should be an instance of ModalLocation class"
            )
        hl = modal_locations
        hl.spatial_unit.verify_criterion("has_lon_lat_columns")
    else:
        hl = ModalLocation(
            *[
                daily_location(
                    date,
                    spatial_unit=make_spatial_unit("lon-lat"),
                    hours=hours,
                    method=method,
                    table=table,
                    subscriber_identifier=subscriber_identifier,
                    ignore_nulls=ignore_nulls,
                    subscriber_subset=subscriber_subset,
                )
                for date in list_of_dates(self.start, self.stop_hl)
            ]
        )

    sl = SubscriberLocations(
        self.start,
        self.stop_sl,
        spatial_unit=make_spatial_unit("lon-lat"),
        hours=hours,
        table=table,
        subscriber_identifier=subscriber_identifier,
        ignore_nulls=ignore_nulls,
        subscriber_subset=subscriber_subset,
    )

    self.joined = hl.join(
        sl,
        on_left="subscriber",
        on_right="subscriber",
        how="left",
        left_append="_home_loc",
        right_append="",
    )
    self.unit = unit
    super().__init__()