def test_stores_table(flowmachine_connect):
    """
    An EventTableSubset can be written to the database as a TABLE via to_sql().
    """
    # NOTE(review): a second test_stores_table is defined further down this
    # module; that later definition rebinds the name, so this version is
    # never collected by pytest unless one of the two is renamed.
    subset = EventTableSubset(start="2016-01-01", stop="2016-01-01 01:00:00")
    write = subset.to_sql(name="test_table", schema="tests")
    # Resolve the result of to_sql() before looking for the table.
    write.result()
    table_exists = flowmachine_connect.has_table(name="test_table", schema="tests")
    assert table_exists
def test_avoids_searching_extra_tables(get_dataframe):
    """
    The explain output for a one-day subset must not mention the
    calls_20160103 table, which lies outside the requested range.
    """
    one_day = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    plan = one_day.explain()
    unwanted_table = "calls_20160103"
    assert unwanted_table not in plan
def test_error_on_all_missing():
    """
    Constructing a subset where every requested date is missing raises
    MissingDateError.
    """
    # Exercise the same date range once without a table argument and once
    # with table="events.topups".
    for table_kwargs in ({}, {"table": "events.topups"}):
        with pytest.raises(MissingDateError):
            EventTableSubset("2016-05-01", "2016-05-02", **table_kwargs)
def test_stores_table(flowmachine_connect):
    """
    After EventTableSubset.to_sql(), the table is listed by the connection's
    inspector under the target schema.
    """
    # NOTE(review): an earlier function in this module is also named
    # test_stores_table; this definition replaces it at import time, so
    # only this version runs unless one of the two is renamed.
    subset = EventTableSubset("2016-01-01", "2016-01-01 01:00:00")
    write = subset.to_sql(name="test_table", schema="tests")
    write.result()
    tables_in_schema = flowmachine_connect.inspector.get_table_names(schema="tests")
    assert "test_table" in tables_in_schema
def test_explain(get_dataframe):
    """
    EventTableSubset.explain() returns a string, with and without analyse=True.
    """
    # explain() is not a critical code path, so checking the return type
    # is considered sufficient here.
    subset = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    for explain_kwargs in ({}, {"analyse": True}):
        assert isinstance(subset.explain(**explain_kwargs), str)
def test_warns_on_missing():
    """
    A UserWarning summarising the missing dates is emitted when only part
    of the requested date range is available.
    """
    # pytest.warns searches for `match` as a regular expression in the
    # warning text.
    expected_warning = "115 of 122 calendar dates missing. Earliest date is 2016-01-01, latest is 2016-01-07"
    with pytest.warns(UserWarning, match=expected_warning):
        EventTableSubset(start="2016-01-01", stop="2016-05-02")
def test_default_dates(get_dataframe):
    """
    Passing None for start and/or stop still yields data whose timestamps
    fall strictly inside the expected overall bounds.
    """
    utc = pytz.timezone("Etc/UTC")

    # Open-ended start: earliest row must be after 2016-01-01 00:00:00 UTC.
    open_start = EventTableSubset(start=None, stop="2016-01-04")
    earliest = get_dataframe(open_start)["datetime"].min().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1, 0, 0, 0))
    assert lower_bound.timestamp() < earliest.timestamp()

    # Open-ended stop (with an hours filter): latest row must be before
    # 2016-01-08 00:00:00 UTC.
    open_stop = EventTableSubset(start="2016-01-04", stop=None, hours=(20, 5))
    latest = get_dataframe(open_stop)["datetime"].max().to_pydatetime()
    upper_bound = utc.localize(datetime(2016, 1, 8, 0, 0, 0))
    assert upper_bound.timestamp() > latest.timestamp()
def test_can_force_rewrite(flowmachine_connect, get_length):
    """
    A stored query can be rewritten to the database after its cache entry
    has been invalidated.
    """
    target = {"name": "test_rewrite", "schema": "tests"}
    subset = EventTableSubset(start="2016-01-01", stop="2016-01-01 01:00:00")
    subset.to_sql(**target).result()

    # Empty the stored table so that a successful rewrite is observable
    # as rows reappearing.
    flowmachine_connect.engine.execute("""DELETE FROM tests.test_rewrite""")
    assert get_length(Table("tests.test_rewrite")) == 0

    # Invalidate the cached table, store again, and expect data to be back.
    subset.invalidate_db_cache(**target)
    subset.to_sql(**target).result()
    assert get_length(Table("tests.test_rewrite")) > 1
def test_can_subset_by_hour(get_dataframe):
    """
    EventTableSubset() can subset by a range of hours.

    Checks that the spread between the largest and smallest hour present
    is 4 for hours=(12, 17), and that every day in the requested date
    range still has rows after the hour filter is applied.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(12, 17))
    df = get_dataframe(sd)
    df["hour"] = df.datetime.apply(lambda x: x.hour)
    df["day"] = df.datetime.apply(lambda x: x.day)

    hour_range = df.hour.max() - df.hour.min()
    assert 4 == hour_range

    # Also check that all the dates are still there.
    # `value in series` tests the Series *index labels*, not its values, so
    # membership has to be checked against the values explicitly.
    days_present = set(df.day)
    assert {1, 2, 3}.issubset(days_present)
def test_handles_dates(get_dataframe):
    """
    Start and stop given as bare dates (no hours or minutes) are accepted,
    and all returned timestamps fall strictly between them.
    """
    utc = pytz.timezone("Etc/UTC")
    subset = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    timestamps = get_dataframe(subset)["datetime"]

    earliest = timestamps.min().to_pydatetime()
    latest = timestamps.max().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1))
    upper_bound = utc.localize(datetime(2016, 1, 2))

    assert lower_bound.timestamp() < earliest.timestamp()
    assert upper_bound.timestamp() > latest.timestamp()
def test_handles_backwards_hours(get_dataframe):
    """
    If the caller passes hours that are 'backwards' this will be
    interpreted as spanning midnight.

    With hours=(20, 5) the hours present should be exactly 20-23 and 0-4,
    and every day in the requested date range should still have rows.
    """
    sd = EventTableSubset(start="2016-01-01", stop="2016-01-04", hours=(20, 5))
    df = get_dataframe(sd)
    df["hour"] = df.datetime.apply(lambda x: x.hour)
    df["day"] = df.datetime.apply(lambda x: x.day)

    unique_hours = sorted(df.hour.unique())
    assert [0, 1, 2, 3, 4, 20, 21, 22, 23] == unique_hours

    # Also check that all the dates are still there.
    # `value in series` tests the Series *index labels*, not its values, so
    # membership has to be checked against the values explicitly.
    days_present = set(df.day)
    assert {1, 2, 3}.issubset(days_present)
def test_handles_mins(get_dataframe):
    """
    Start and stop given with a time-of-day component are accepted, and
    all returned timestamps fall strictly between them.
    """
    utc = pytz.timezone("Etc/UTC")
    subset = EventTableSubset("2016-01-01 13:30:30", "2016-01-02 16:25:00")
    timestamps = get_dataframe(subset)["datetime"]

    earliest = timestamps.min().to_pydatetime()
    latest = timestamps.max().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1, 13, 30, 30))
    upper_bound = utc.localize(datetime(2016, 1, 2, 16, 25, 0))

    assert lower_bound.timestamp() < earliest.timestamp()
    assert upper_bound.timestamp() > latest.timestamp()
def test_head_has_column_names(get_dataframe):
    """
    The dataframe for an EventTableSubset has exactly the expected columns,
    in the expected order.
    """
    expected_columns = [
        "country_code",
        "datetime",
        "duration",
        "id",
        "imei",
        "imsi",
        "location_id",
        "subscriber",
        "msisdn_counterpart",
        "network",
        "operator_code",
        "outgoing",
        "tac",
    ]
    subset = EventTableSubset(start="2016-01-01", stop="2016-01-02")
    actual_columns = get_dataframe(subset).columns.tolist()
    assert expected_columns == actual_columns
def test_error_on_start_is_stop(get_dataframe):
    """A ValueError is raised when start and stop are the same date."""
    same_day = "2016-01-01"
    with pytest.raises(ValueError):
        EventTableSubset(start=same_day, stop=same_day)