def test_table_parent():
    """
    A table restricted to a subset of columns should depend on the full
    underlying table.
    """
    restricted = Table("events.calls", columns=["id"])
    parent = restricted.dependencies.pop()
    assert parent.md5 == Table("events.calls").md5
def test_subsetnumeric_subsetnumeric(get_dataframe):
    """
    This test applies two numeric subsets one after the other and checks
    the result against the equivalent pandas filter.
    """
    sub_cola = "shape_star"
    sub_lowa = 0.1
    sub_higha = 0.2
    sub_colb = "shape_leng"
    sub_lowb = 1.0
    sub_highb = 2.0
    t = Table("geography.admin3")
    t_df = get_dataframe(t)
    # BUG FIX: the first subset's upper bound was previously sub_lowb (a
    # bound belonging to the *other* column), which disagreed with the
    # pandas reference filter below. Use sub_higha so both sides match.
    sub_q = t.numeric_subset(sub_cola, sub_lowa, sub_higha).numeric_subset(
        sub_colb, sub_lowb, sub_highb
    )
    sub_df = t_df[
        (sub_lowa <= t_df[sub_cola])
        & (t_df[sub_cola] <= sub_higha)
        & (sub_lowb <= t_df[sub_colb])
        & (t_df[sub_colb] <= sub_highb)
    ]
    sub_df = sub_df.reset_index(drop=True)
    assert get_dataframe(sub_q).equals(sub_df)
def test_children():
    """
    Table inheritance detection: events.calls is partitioned into child
    tables, geography.admin3 is not.
    """
    partitioned = Table("events.calls")
    flat = Table("geography.admin3")
    assert partitioned.has_children()
    assert not flat.has_children()
def test_subset_subsetnumeric(get_dataframe):
    """
    Chain a categorical subset with a numeric subset, in both possible
    orders, and check each against the equivalent pandas filter.
    """
    cat_col, cat_val = "admin1name", "Central Development Region"
    num_col, low, high = "shape_area", 0.1, 0.12
    table = Table("geography.admin3")
    full_df = get_dataframe(table)
    cat_then_num = table.subset(cat_col, cat_val).numeric_subset(num_col, low, high)
    num_then_cat = table.numeric_subset(num_col, low, high).subset(cat_col, cat_val)
    expected = full_df[
        (full_df[cat_col] == cat_val)
        & (low <= full_df[num_col])
        & (full_df[num_col] <= high)
    ].reset_index(drop=True)
    assert get_dataframe(cat_then_num).equals(expected)
    assert get_dataframe(num_then_cat).equals(expected)
def test_join_column_names(join_type):
    """Test that join column_names attribute is correct"""
    t = Table("events.calls_20160101")
    joined = t.join(t, on_left="msisdn", how=join_type)
    cols = t.column_names
    cols.remove("msisdn")
    # The original comprehension [f"{c}" for c in cols] was a no-op copy
    # of cols (the f-string added nothing); use the list itself — the
    # joined columns simply appear once per side after the join key.
    expected = ["msisdn"] + cols + cols
    assert expected == joined.column_names
def test_pickling():
    """
    A subset survives a pickle round trip: both its SQL and its query id
    come back unchanged.
    """
    subset = Table("events.calls").subset(
        "id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    )
    roundtripped = pickle.loads(pickle.dumps(subset))
    assert subset.get_query() == roundtripped.get_query()
    roundtripped = pickle.loads(pickle.dumps(subset))
    assert subset.query_id == roundtripped.query_id
def test_union(get_dataframe):
    """
    union(all=False) deduplicates: a call id that occurs twice in the
    source table still occurs only twice after a self-union.
    """
    calls = Table(schema="events", name="calls")
    deduped = get_dataframe(calls.union(calls, all=False))
    matching_rows = deduped[deduped.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    assert len(matching_rows) == 2
def test_union_all(get_dataframe):
    """
    The default union keeps duplicates: a self-union doubles the number
    of rows carrying a given call id.
    """
    calls = Table(schema="events", name="calls")
    with_dupes = get_dataframe(calls.union(calls))
    matching_rows = with_dupes[with_dupes.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    assert len(matching_rows) == 4
def test_store_with_table():
    """
    A subset of a table can be stored; invalidating the parent table's
    cache drops the stored subset but leaves the table itself stored.
    """
    parent = Table("events.calls")
    subset = parent.subset(
        "id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    )
    subset.store().result()
    assert subset.is_stored
    parent.invalidate_db_cache()
    assert not subset.is_stored
    assert parent.is_stored
def test_dependencies():
    """
    A bare table has no query dependencies; a column-restricted table
    depends on exactly one query — its parent table, identified here by a
    known query id.
    """
    bare = Table("events.calls")
    assert bare.dependencies == set()
    restricted = Table("events.calls", columns=["id"])
    deps = restricted.dependencies
    assert len(deps) == 1
    assert deps.pop().query_id == "057addedac04dbeb1dcbbb6b524b43f0"
def test_can_force_rewrite(flowmachine_connect, get_length):
    """
    Passing force=True to to_sql re-runs a query whose output table
    already exists, repopulating it.
    """
    subset_query = EventTableSubset("2016-01-01", "2016-01-01 01:00:00")
    subset_query.to_sql(schema="tests", name="test_rewrite")
    # Empty the stored table, then force a rewrite and verify the data
    # comes back.
    flowmachine_connect.engine.execute("""DELETE FROM tests.test_rewrite""")
    assert get_length(Table("tests.test_rewrite")) == 0
    subset_query.to_sql(schema="tests", name="test_rewrite", force=True)
    assert get_length(Table("tests.test_rewrite")) > 1
def test_join_column_names(join_type):
    """Test that join column_names attribute is correct"""
    calls = Table("events.calls_20160101", columns=["location_id", "datetime"])
    cells = Table("infrastructure.cells", columns=["id", "geom_point"])
    joined = calls.join(cells, on_left="location_id", on_right="id", how=join_type)
    # Left-flavoured joins keep the left key column's name; otherwise the
    # right key column's name wins.
    keeps_left_key = join_type in ("left", "inner", "left outer")
    first_column = "location_id" if keeps_left_key else "id"
    assert joined.column_names == [first_column, "datetime", "geom_point"]
def subscriber_list_table(subscriber_list, flowmachine_connect):
    """
    Fixture: create a subscriber_list table in the database populated with
    the given subscribers, yield it as a Table, then drop it on teardown.
    """
    engine = flowmachine_connect.engine
    with engine.begin():
        engine.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")
        values = ",".join("('{}')".format(u) for u in subscriber_list)
        engine.execute(
            """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(values)
        )
    subs_table = Table("subscriber_list")
    yield subs_table
    subs_table.invalidate_db_cache(drop=True)
def test_table_init():
    """
    Test that table creation handles params properly: a valid table
    constructs without error, and bad schemas, columns, or names raise
    ValueError.
    """
    # Valid construction should not raise; the unused binding `t = ...`
    # was removed since the object itself is never inspected.
    Table("events.calls")
    with pytest.raises(ValueError):
        Table("events.calls", "moose")
    with pytest.raises(ValueError):
        Table("events.calls", columns="NO SUCH COLUMN")
    with pytest.raises(ValueError):
        Table("NOSUCHTABLE")
    with pytest.raises(ValueError):
        Table("events.WHAAAAAAAAT")
def test_can_force_rewrite(flowmachine_connect, get_length):
    """
    After invalidating its cache record, re-running to_sql repopulates a
    table whose rows have been deleted out from under it.
    """
    subset_query = EventTableSubset(start="2016-01-01", stop="2016-01-01 01:00:00")
    subset_query.to_sql(name="test_rewrite", schema="tests").result()
    # Wipe the stored rows, then invalidate and rewrite; the table should
    # hold data again afterwards.
    get_db().engine.execute("""DELETE FROM tests.test_rewrite""")
    assert get_length(Table("tests.test_rewrite")) == 0
    subset_query.invalidate_db_cache(name="test_rewrite", schema="tests")
    subset_query.to_sql(name="test_rewrite", schema="tests").result()
    assert get_length(Table("tests.test_rewrite")) > 1
def test_subscribers_who_make_atleast_3_calls_in_central_development_region():
    """
    Subsetting over multiple geometries at once: find subscribers who
    made at least 2 calls in any admin2 region inside the Central
    Development admin1 region.
    """
    start, stop = "2016-01-01", "2016-01-07"
    central_regions = Table("admin2", "geography").subset(
        "admin1name", ["Central Development Region"]
    )
    located = SubscriberLocationSubset(
        start,
        stop,
        min_calls=2,
        level="polygon",
        column_name="admin2pcod",
        polygon_table=central_regions,
    )
    df = located.get_dataframe()
    # Results should span several admin2 regions...
    assert len(df.admin2pcod.unique()) > 1
    # ...and some subscribers qualify in more than one region, so they
    # appear more than once.
    assert len(df[df.duplicated("subscriber")]) > 0
def test_cdrs_can_be_subset_by_table(self):
    """
    We can subset CDRs by a table in the database.
    """
    # Build a throwaway subscriber_list table to subset against.
    con = Table.connection.engine
    con.execute("DROP TABLE IF EXISTS subscriber_list")
    con.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")
    values = ",".join("('{}')".format(u) for u in self.subscriber_list)
    con.execute(
        """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(values)
    )
    subset = EventTableSubset(
        "2016-01-01", "2016-01-03", subscriber_subset=Table("subscriber_list")
    )
    result_df = subset.get_dataframe()
    con.execute("DROP TABLE IF EXISTS subscriber_list")
    # The subscribers appearing in the result (msisdn_from/msisdn_to logic
    # is handled upstream) must be exactly the requested list.
    self.assertEqual(set(result_df.subscriber), set(self.subscriber_list))
def test_subset():
    """
    A freshly created subset of a table must not be reported as stored.
    """
    subset = Table("events.calls").subset(
        "id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    )
    assert not subset.is_stored
def test_touch_cache_record_for_table(flowmachine_connect):
    """
    Touching a cache record for a table should update access count and
    last accessed but not touch score, or counter.
    """
    table = Table("events.calls_20160101")
    flowmachine_connect.engine.execute(
        f"UPDATE cache.cached SET compute_time = 1 WHERE query_id=%s", table.query_id
    )  # Compute time for tables is zero, so set to 1 to avoid zeroing out
    # Baseline before touching: score is 0 and the record has been
    # accessed exactly once.
    assert 0 == get_score(flowmachine_connect, table.query_id)
    assert (
        1
        == flowmachine_connect.fetch(
            f"SELECT access_count FROM cache.cached WHERE query_id='{table.query_id}'"
        )[0][0]
    )
    # Remember the pre-touch access timestamp so we can verify it advances.
    accessed_at = flowmachine_connect.fetch(
        f"SELECT last_accessed FROM cache.cached WHERE query_id='{table.query_id}'"
    )[0][0]
    touch_cache(flowmachine_connect, table.query_id)
    # Score must remain untouched for tables...
    assert 0 == get_score(flowmachine_connect, table.query_id)
    # ...while the access count increments.
    assert (
        2
        == flowmachine_connect.fetch(
            f"SELECT access_count FROM cache.cached WHERE query_id='{table.query_id}'"
        )[0][0]
    )
    # No cache touch should be recorded
    # (nextval itself consumes one value, hence the expected 2 — presumably
    # the sequence started at 1; NOTE(review): confirm against fixture setup).
    assert (
        2 == flowmachine_connect.fetch("SELECT nextval('cache.cache_touches');")[0][0]
    )
    # The last_accessed timestamp must have moved forward.
    assert (
        accessed_at
        < flowmachine_connect.fetch(
            f"SELECT last_accessed FROM cache.cached WHERE query_id='{table.query_id}'"
        )[0][0]
    )
def __init__(self, table="sites", date=None):
    """
    Parameters
    ----------
    table : str, default 'sites'
        Which table to collect versioned information from. Only the
        tables infrastructure.sites and infrastructure.cells are
        supported.
    date : str, default None
        The date to collect a valid version from. This date must be
        formatted using ISO standards (2016-01-13). If no date is passed
        the current date will be used.

    Raises
    ------
    ValueError
        If `table` is not one of 'sites' or 'cells'.
    """
    if table not in ("sites", "cells"):
        # BUG FIX: the original concatenation produced a duplicated
        # "and and" in the message.
        raise ValueError(
            "Only the tables infrastructure.sites and "
            "infrastructure.cells are supported."
        )
    if date is None:  # PEP 8: compare to None with `is`, not `==`
        date = datetime.now().strftime("%Y-%m-%d")
    self.table = Table(schema="infrastructure", name=table)
    self.date = date
    super().__init__()
def get_sql_for_query_id(query_id):
    """
    Return the SQL which, when run against flowdb, will return the result
    for the query with the given id.

    Parameters
    ----------
    query_id : str
        The query id

    Returns
    -------
    str
    """
    cached = Table(f"x{query_id}", "cache")
    return cached.get_query()
def test_subset_subset(get_dataframe):
    """
    Chaining two categorical subsets matches the equivalent combined
    pandas filter.
    """
    first_col, first_val = "admin1name", "Central Development Region"
    second_col, second_val = "admin2name", "Bagmati"
    table = Table("geography.admin3")
    full_df = get_dataframe(table)
    chained = table.subset(first_col, first_val).subset(second_col, second_val)
    expected = full_df[
        (full_df[first_col] == first_val) & (full_df[second_col] == second_val)
    ].reset_index(drop=True)
    assert get_dataframe(chained).equals(expected)
def test_raises_valueerror_when_grouping_element_not_provided():
    """
    RasterStatistics() raises ValueError when `grouping_element` not provided.
    """
    # Removed the unused locals `G` (shadowed by the literal None below)
    # and `r` (the constructor's return value is never used).
    vector = Table(schema="geography", name="admin2")
    with pytest.raises(ValueError):
        RasterStatistics(
            "population.small_nepal_raster", vector=vector, grouping_element=None
        )
def test_raster_statistics_column_names_vector(self):
    """
    Test that column_names property matches head(0) for RasterStatistics
    when vector is not None
    """
    admin2 = Table(schema="public", name="gambia_admin2")
    stats = RasterStatistics(
        raster="worldpop_gambia", vector=admin2, grouping_element="district_c"
    )
    assert stats.column_names == stats.head(0).columns.tolist()
def test_raster_statistics_column_names_vector(get_dataframe):
    """
    Test that column_names property matches head(0) for RasterStatistics
    when vector is not None
    """
    admin2 = Table(schema="geography", name="admin2")
    stats = RasterStatistics(
        raster="population.small_nepal_raster",
        vector=admin2,
        grouping_element="admin2pcod",
    )
    assert stats.column_names == get_dataframe(stats).columns.tolist()
def test_raises_notimplemented_when_wrong_statistic_requested():
    """
    RasterStatistics() raises NotImplementedError if wrong statistic requested.
    """
    admin2 = Table(schema="geography", name="admin2")
    grouping = "admin2pcod"
    with pytest.raises(NotImplementedError):
        RasterStatistics(
            raster="population.small_nepal_raster",
            vector=admin2,
            grouping_element=grouping,
            statistic="foobar",
        )
def test_computes_expected_clipping_values(get_dataframe):
    """
    RasterStatistics() returns correct values when clipping vector and
    raster layers.
    """
    admin2 = Table(schema="geography", name="admin2")
    stats = RasterStatistics(
        raster="population.small_nepal_raster",
        vector=admin2,
        grouping_element="admin2pcod",
    )
    result = get_dataframe(stats)
    # Exactly one region overlaps the clipped raster.
    assert len(result) == 1
    assert "524 3 07" in result.admin2pcod.values
    assert result.set_index("admin2pcod").loc["524 3 07"][0] == 2500000.0
def test_query_can_be_subscriber_set_restricted(self):
    """Test that some queries can be limited to only a subset of subscribers."""
    # Build a throwaway subscriber_list table in the database.
    con = Table.connection.engine
    con.execute("DROP TABLE IF EXISTS subscriber_list")
    con.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")
    values = ",".join("('{}')".format(u) for u in self.subscriber_list)
    con.execute(
        """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(values)
    )
    rog = RadiusOfGyration(
        "2016-01-01", "2016-01-03", subscriber_subset=Table("subscriber_list")
    )
    hl = HomeLocation(
        *[
            daily_location(d, subscriber_subset=Table("subscriber_list"))
            for d in list_of_dates("2016-01-01", "2016-01-03")
        ]
    )
    rog_df = rog.get_dataframe()
    hl_df = hl.get_dataframe()
    con.execute("DROP TABLE IF EXISTS subscriber_list")
    # Both queries should report exactly the requested subscribers
    # (msisdn_from/msisdn_to handling happens upstream).
    self.assertEqual(set(rog_df.subscriber), set(self.subscriber_list))
    self.assertEqual(set(hl_df.subscriber), set(self.subscriber_list))
def test_different_call_days_format(self):
    """
    Test whether we can pass different call days format such as table name,
    SQL query and CallDays class.
    """
    cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
    # Directly from the CallDays query object...
    self.assertIsInstance(HartiganCluster(cd, 50).get_dataframe(), pd.DataFrame)
    # ...from the stored table backing it...
    cd.store().result()
    self.assertIsInstance(
        HartiganCluster(Table(cd.table_name), 50).get_dataframe(), pd.DataFrame
    )
    # ...and from its raw SQL wrapped in a CustomQuery.
    self.assertIsInstance(
        HartiganCluster(CustomQuery(cd.get_query()), 50).get_dataframe(),
        pd.DataFrame,
    )
def test_computes_expected_clipping_values(self):
    """
    RasterStatistics() returns correct values when clipping vector and
    raster layers.
    """
    group_col = "district_c"
    admin2 = Table(schema="public", name="gambia_admin2")
    stats = RasterStatistics(
        raster="worldpop_gambia", vector=admin2, grouping_element=group_col
    )
    result = stats.get_dataframe()
    # Compare each district's total against the rasterio reference values.
    for expected in self.rasterio_results.to_dict("records"):
        computed = result[result[group_col] == expected["district"]].statistic.iloc[0]
        self.assertAlmostEqual(int(computed), expected["value"])