Ejemplo n.º 1
0
 def test_avoids_searching_extra_tables(self):
     """
     The explain output of a subset from 2016-01-01 to 2016-01-02 must not
     mention the calls_20160103 table.
     """
     query_plan = EventTableSubset("2016-01-01", "2016-01-02").explain()
     self.assertNotIn("calls_20160103", query_plan)
Ejemplo n.º 2
0
    def test_cdrs_can_be_subset_by_table(self):
        """
        We can subset CDRs by a table in the database.

        A scratch ``subscriber_list`` table is created from
        ``self.subscriber_list``, passed as ``subscriber_subset``, and
        dropped again afterwards - even when building or running the query
        raises.
        """

        con = Table.connection.engine
        drop_sql = "DROP TABLE IF EXISTS subscriber_list"

        # Start clean in case an earlier run left the table behind
        con.execute(drop_sql)
        try:
            con.execute("""CREATE TABLE subscriber_list (subscriber TEXT)""")

            # Values come from the test's own subscriber list, so plain
            # string formatting is used to build the VALUES clause.
            formatted_subscribers = ",".join(
                "('{}')".format(u) for u in self.subscriber_list)
            con.execute(
                """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
                    formatted_subscribers))

            su = EventTableSubset("2016-01-01",
                                  "2016-01-03",
                                  subscriber_subset=Table("subscriber_list"))
            df = su.get_dataframe()
        finally:
            # Always remove the scratch table so a failure above cannot leak
            # state into later tests.
            con.execute(drop_sql)

        # Get the set of subscribers present in the dataframe, we need to handle the logic
        # of msisdn_from/msisdn_to
        calculated_subscriber_set = set(df.subscriber)

        self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
Ejemplo n.º 3
0
def test_stores_view(flowmachine_connect):
    """
    Storing an EventTableSubset with to_sql(..., as_view=True) makes
    "test_view" appear among the view names of the "tests" schema.
    """
    one_hour_subset = EventTableSubset("2016-01-01", "2016-01-01 01:00:00")
    one_hour_subset.to_sql(schema="tests", name="test_view", as_view=True)
    view_names = flowmachine_connect.inspector.get_view_names(schema="tests")
    assert "test_view" in view_names
Ejemplo n.º 4
0
def test_events_table_subset_column_names(columns):
    """Check that column_names agrees with the columns of an empty head()."""
    subset = EventTableSubset(
        "2016-01-01",
        "2016-01-02",
        columns=columns,
        tables=["events.calls"],
    )
    returned_columns = subset.head(0).columns.tolist()
    assert returned_columns == subset.column_names
Ejemplo n.º 5
0
def test_avoids_searching_extra_tables(get_dataframe):
    """
    The explain output of a subset from 2016-01-01 to 2016-01-02 must not
    mention the calls_20160103 table.
    """
    query_plan = EventTableSubset("2016-01-01", "2016-01-02").explain()
    assert "calls_20160103" not in query_plan
Ejemplo n.º 6
0
 def test_dataframe_has_column_names(self):
     """
     The sorted columns of the full dataframe equal self.expected_columns.
     """
     frame = EventTableSubset("2016-01-01", "2016-01-02").get_dataframe()
     actual_columns = sorted(frame.columns)
     self.assertEqual(actual_columns, self.expected_columns)
Ejemplo n.º 7
0
 def test_head_has_column_names(self):
     """
     The sorted columns of head() equal self.expected_columns.
     """
     preview = EventTableSubset("2016-01-01", "2016-01-02").head()
     actual_columns = sorted(preview.columns)
     self.assertEqual(actual_columns, self.expected_columns)
Ejemplo n.º 8
0
def test_stores_table(flowmachine_connect):
    """
    Storing an EventTableSubset with to_sql() makes "test_table" appear
    among the table names of the "tests" schema.
    """
    one_hour_subset = EventTableSubset("2016-01-01", "2016-01-01 01:00:00")
    one_hour_subset.to_sql(schema="tests", name="test_table")
    table_names = flowmachine_connect.inspector.get_table_names(schema="tests")
    assert "test_table" in table_names
Ejemplo n.º 9
0
def test_error_on_all_missing():
    """
    Constructing a subset over 2016-05-01..2016-05-02 raises
    MissingDateError, both with default arguments and with
    table="events.topups".
    """
    for extra_kwargs in ({}, {"table": "events.topups"}):
        with pytest.raises(MissingDateError):
            EventTableSubset("2016-05-01", "2016-05-02", **extra_kwargs)
Ejemplo n.º 10
0
 def test_error_on_all_missing(self):
     """
     Date subsetter should error when all dates are missing.

     Checked both with default arguments and with table="events.topups".
     """
     # The constructor is expected to raise, so its result is never bound.
     with self.assertRaises(MissingDateError):
         EventTableSubset("2016-05-01", "2016-05-02")
     with self.assertRaises(MissingDateError):
         EventTableSubset("2016-05-01",
                          "2016-05-02",
                          table="events.topups")
Ejemplo n.º 11
0
    def test_explain(self):
        """
        EventTableSubset().explain() method returns a string

        Checked for both the plain call and explain(analyse=True).
        """

        # Usually not a critical function, so let's simply test by
        # asserting that it returns a string
        sd = EventTableSubset("2016-01-01", "2016-01-02")
        explain_string = sd.explain()
        # assertIsInstance is the idiomatic type check and reports the
        # offending value on failure.
        self.assertIsInstance(explain_string, str)
        self.assertIsInstance(sd.explain(analyse=True), str)
Ejemplo n.º 12
0
def test_explain(get_dataframe):
    """
    explain() returns a str, with and without analyse=True.
    """

    # Not a critical function, so checking the return type is enough here.
    subset = EventTableSubset("2016-01-01", "2016-01-02")
    for explain_kwargs in ({}, {"analyse": True}):
        assert isinstance(subset.explain(**explain_kwargs), str)
Ejemplo n.º 13
0
def test_events_table_subscriber_ident_substitutions(ident):
    """Check that the requested identifier column is exposed under the name "subscriber"."""
    subset = EventTableSubset(
        "2016-01-01",
        "2016-01-02",
        columns=[ident],
        tables=["events.calls"],
        subscriber_identifier=ident,
    )
    first_column = subset.head(0).columns[0]
    assert "subscriber" == first_column
    assert ["subscriber"] == subset.column_names
Ejemplo n.º 14
0
 def test_can_subset_by_hour(self):
     """
     EventTableSubset() can subset by a range of hours

     With hours=(12, 17) the spread between the earliest and latest hour
     in the result must be 4, and days 1, 2 and 3 must all still appear.
     """
     sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(12, 17))
     df = sd.get_dataframe()
     df["hour"] = df.datetime.apply(lambda x: x.hour)
     df["day"] = df.datetime.apply(lambda x: x.day)
     hour_span = df.hour.max() - df.hour.min()
     self.assertEqual(hour_span, 4)
     # Also check that all the dates are still there.
     # `x in series` tests the index labels rather than the values, so
     # compare against the set of values instead.
     days_present = set(df.day)
     self.assertTrue({1, 2, 3}.issubset(days_present))
Ejemplo n.º 15
0
 def test_can_subset_by_sampler(self):
     """Check that a random sample of another query can be used as the subscriber subset."""
     all_subscribers = UniqueSubscribers("2016-01-01", "2016-01-07")
     sampled = all_subscribers.random_sample(size=10,
                                             method="system",
                                             seed=0.1)
     subset = EventTableSubset("2016-01-01",
                               "2016-01-03",
                               subscriber_subset=sampled)
     found = set(subset.get_dataframe().subscriber)
     expected = set(sampled.get_dataframe().subscriber)
     self.assertSetEqual(found, expected)
     self.assertEqual(len(found), 10)
Ejemplo n.º 16
0
 def test_handles_backwards_dates(self):
     """
     If the subscriber passes hours that are 'backwards' this will be interpreted as spanning midnight.

     With hours=(20, 5) the result must contain exactly the hours 20-23
     and 0-4, and days 1, 2 and 3 must all still appear.
     """
     sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(20, 5))
     df = sd.get_dataframe()
     df["hour"] = df.datetime.apply(lambda x: x.hour)
     df["day"] = df.datetime.apply(lambda x: x.day)
     unique_hours = sorted(df.hour.unique())
     self.assertEqual([0, 1, 2, 3, 4, 20, 21, 22, 23], unique_hours)
     # Also check that all the dates are still there.
     # `x in series` tests the index labels rather than the values, so
     # compare against the set of values instead.
     days_present = set(df.day)
     self.assertTrue({1, 2, 3}.issubset(days_present))
Ejemplo n.º 17
0
    def test_cdrs_can_be_subset_by_list(self):
        """
        Passing a plain list as subscriber_subset yields exactly those
        subscribers in the result.
        """

        frame = EventTableSubset(
            "2016-01-01",
            "2016-01-03",
            subscriber_subset=self.subscriber_list,
        ).get_dataframe()

        # The subscriber column already reflects the msisdn_from/msisdn_to
        # handling, so collecting its distinct values is enough.
        found_subscribers = set(frame.subscriber)
        wanted_subscribers = set(self.subscriber_list)

        self.assertEqual(found_subscribers, wanted_subscribers)
Ejemplo n.º 18
0
def test_warns_on_missing():
    """
    A date range that is only partly covered by the data emits a
    UserWarning describing the missing dates.
    """
    expected_warning = "115 of 122 calendar dates missing. Earliest date is 2016-01-01, latest is 2016-01-07"
    with pytest.warns(UserWarning, match=expected_warning):
        EventTableSubset("2016-01-01", "2016-05-02")
Ejemplo n.º 19
0
 def test_warns_on_missing(self):
     """
     A date range that is only partly covered by the data emits a
     UserWarning describing the missing dates.
     """
     expected_warning = "115 of 122 calendar dates missing. Earliest date is 2016-01-01, latest is 2016-01-07"
     with self.assertWarnsRegex(UserWarning, expected_warning):
         EventTableSubset("2016-01-01", "2016-05-02")
Ejemplo n.º 20
0
    def test_handles_dates(self):
        """
        Date-only bounds are accepted: every returned datetime lies strictly
        between midnight UTC on 2016-01-01 and midnight UTC on 2016-01-02.
        """
        utc = pytz.timezone("Etc/UTC")
        lower_bound = utc.localize(datetime(2016, 1, 1)).timestamp()
        upper_bound = utc.localize(datetime(2016, 1, 2)).timestamp()

        timestamps = EventTableSubset("2016-01-01",
                                      "2016-01-02").get_dataframe()["datetime"]
        earliest = timestamps.min().to_pydatetime().timestamp()
        latest = timestamps.max().to_pydatetime().timestamp()

        self.assertTrue(earliest > lower_bound)
        self.assertTrue(latest < upper_bound)
Ejemplo n.º 21
0
    def test_handles_mins(self):
        """
        Bounds with a time component are accepted: every returned datetime
        lies strictly between 2016-01-01 13:30:30 UTC and
        2016-01-02 16:25:00 UTC.
        """
        utc = pytz.timezone("Etc/UTC")
        lower_bound = utc.localize(datetime(2016, 1, 1, 13, 30, 30)).timestamp()
        upper_bound = utc.localize(datetime(2016, 1, 2, 16, 25, 0)).timestamp()

        timestamps = EventTableSubset(
            "2016-01-01 13:30:30",
            "2016-01-02 16:25:00").get_dataframe()["datetime"]
        earliest = timestamps.min().to_pydatetime().timestamp()
        latest = timestamps.max().to_pydatetime().timestamp()

        self.assertTrue(earliest > lower_bound)
        self.assertTrue(latest < upper_bound)
Ejemplo n.º 22
0
 def test_subset_correct(self):
     """
     Test that pushed in subsetting matches .subset result

     Both an EventTableSubset and a HomeLocation built with
     ``subscriber_subset`` are compared against the same query subset
     afterwards with ``.subset("subscriber", ...)``.
     """

     def comparable(frame):
         # NOTE(review): row order is assumed not to be guaranteed by
         # either query, so sort on every column and drop the index
         # before comparing - confirm.
         return frame.sort_values(list(frame.columns)).reset_index(drop=True)

     su = EventTableSubset("2016-01-01",
                           "2016-01-03",
                           subscriber_subset=self.subscriber_list)
     subsu = EventTableSubset("2016-01-01",
                              "2016-01-03").subset("subscriber",
                                                   self.subscriber_list)
     # all(df_a == df_b) iterates the column labels and is therefore always
     # truthy; DataFrame.equals compares the actual cell values.
     self.assertTrue(
         comparable(su.get_dataframe()).equals(
             comparable(subsu.get_dataframe())))
     su = HomeLocation(*[
         daily_location(d, subscriber_subset=self.subscriber_list)
         for d in list_of_dates("2016-01-01", "2016-01-07")
     ])
     # Both sides must span the same dates for the comparison to be
     # meaningful (the reference previously stopped at 2016-01-03).
     subsu = HomeLocation(*[
         daily_location(d)
         for d in list_of_dates("2016-01-01", "2016-01-07")
     ]).subset("subscriber", self.subscriber_list)
     self.assertTrue(
         comparable(su.get_dataframe()).equals(
             comparable(subsu.get_dataframe())))
Ejemplo n.º 23
0
 def test_ommitted_subscriber_column(self):
     """Test that a result is returned and a warning is raised when omitting a subscriber column."""
     with self.assertWarns(UserWarning):
         duration_only_query = EventTableSubset(
             "2016-01-01",
             "2016-01-03",
             subscriber_subset=self.subscriber_list,
             columns=["duration"],
         )
         duration_only = duration_only_query.get_dataframe()
     with_msisdn_query = EventTableSubset(
         "2016-01-01",
         "2016-01-03",
         subscriber_subset=self.subscriber_list,
         columns=["msisdn", "duration"],
     )
     with_msisdn = with_msisdn_query.get_dataframe()
     # Durations must match, and only the requested column may come back.
     self.assertListEqual(duration_only.duration.values.tolist(),
                          with_msisdn.duration.values.tolist())
     self.assertListEqual(duration_only.columns.tolist(), ["duration"])
Ejemplo n.º 24
0
def test_subset_correct(subscriber_list, get_dataframe):
    """
    Test that pushed in subsetting matches .subset result

    Both an EventTableSubset and a ModalLocation built with
    ``subscriber_subset`` are compared against the same query subset
    afterwards with ``.subset("subscriber", ...)``.
    """

    def comparable(query):
        # NOTE(review): row order is assumed not to be guaranteed by either
        # query, so sort on every column and drop the index before
        # comparing - confirm.
        frame = get_dataframe(query)
        return frame.sort_values(list(frame.columns)).reset_index(drop=True)

    su = EventTableSubset("2016-01-01",
                          "2016-01-03",
                          subscriber_subset=subscriber_list)
    subsu = EventTableSubset("2016-01-01",
                             "2016-01-03").subset("subscriber",
                                                  subscriber_list)
    # all(df_a == df_b) iterates the column labels and is therefore always
    # truthy; DataFrame.equals compares the actual cell values.
    assert comparable(su).equals(comparable(subsu))
    su = ModalLocation(*[
        daily_location(d, subscriber_subset=subscriber_list)
        for d in list_of_dates("2016-01-01", "2016-01-07")
    ])
    # Both sides must span the same dates for the comparison to be
    # meaningful (the reference previously stopped at 2016-01-03).
    subsu = ModalLocation(*[
        daily_location(d)
        for d in list_of_dates("2016-01-01", "2016-01-07")
    ]).subset("subscriber", subscriber_list)
    assert comparable(su).equals(comparable(subsu))
Ejemplo n.º 25
0
def test_omitted_subscriber_column(get_dataframe, subscriber_list):
    """Check that leaving the subscriber column out still returns a result, with a UserWarning."""
    with pytest.warns(UserWarning):
        duration_only_query = EventTableSubset(
            "2016-01-01",
            "2016-01-03",
            subscriber_subset=subscriber_list,
            columns=["duration"],
        )
        duration_only = get_dataframe(duration_only_query)
    with_msisdn_query = EventTableSubset(
        "2016-01-01",
        "2016-01-03",
        subscriber_subset=subscriber_list,
        columns=["msisdn", "duration"],
    )
    with_msisdn = get_dataframe(with_msisdn_query)
    # Durations must match, and only the requested column may come back.
    omitted_durations = duration_only.duration.values.tolist()
    reference_durations = with_msisdn.duration.values.tolist()
    assert omitted_durations == reference_durations
    assert duration_only.columns.tolist() == ["duration"]
Ejemplo n.º 26
0
def test_default_dates(get_dataframe):
    """
    Passing None for the start or the stop date is accepted; the resulting
    datetimes must lie after 2016-01-01 00:00 UTC and before
    2016-01-08 00:00 UTC respectively.
    """
    utc = pytz.timezone("Etc/UTC")

    # Open-ended start
    open_start = get_dataframe(EventTableSubset(None, "2016-01-04"))
    earliest = open_start["datetime"].min().to_pydatetime()
    lower_bound = utc.localize(datetime(2016, 1, 1, 0, 0, 0))
    assert earliest.timestamp() > lower_bound.timestamp()

    # Open-ended stop, combined with an hour range
    open_stop = get_dataframe(
        EventTableSubset("2016-01-04", None, hours=(20, 5)))
    latest = open_stop["datetime"].max().to_pydatetime()
    upper_bound = utc.localize(datetime(2016, 1, 8, 0, 0, 0))
    assert latest.timestamp() < upper_bound.timestamp()
Ejemplo n.º 27
0
def test_can_subset_by_sampler(get_dataframe):
    """Check that a random sample of another query can be used as the subscriber subset."""
    all_subscribers = UniqueSubscribers("2016-01-01", "2016-01-07")
    sampled = all_subscribers.random_sample(size=10,
                                            method="system",
                                            seed=0.1)
    subset = EventTableSubset("2016-01-01",
                              "2016-01-03",
                              subscriber_subset=sampled)
    found = set(get_dataframe(subset).subscriber)
    expected = set(get_dataframe(sampled).subscriber)
    assert found == expected
    assert len(found) == 10
Ejemplo n.º 28
0
def test_can_subset_by_hour(get_dataframe):
    """
    EventTableSubset() can subset by a range of hours

    With hours=(12, 17) the spread between the earliest and latest hour in
    the result must be 4, and days 1, 2 and 3 must all still appear.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(12, 17))
    df = get_dataframe(sd)
    df["hour"] = df.datetime.apply(lambda x: x.hour)
    df["day"] = df.datetime.apply(lambda x: x.day)
    hour_span = df.hour.max() - df.hour.min()
    assert 4 == hour_span
    # Also check that all the dates are still there.
    # `x in series` tests the index labels rather than the values, so test
    # membership against the set of values instead.
    days_present = set(df.day)
    assert 3 in days_present
    assert 2 in days_present
    assert 1 in days_present
Ejemplo n.º 29
0
def test_handles_backwards_hours(get_dataframe):
    """
    If the subscriber passes hours that are 'backwards' this will be interpreted as spanning midnight.

    With hours=(20, 5) the result must contain exactly the hours 20-23 and
    0-4, and days 1, 2 and 3 must all still appear.
    """
    sd = EventTableSubset("2016-01-01", "2016-01-04", hours=(20, 5))
    df = get_dataframe(sd)
    df["hour"] = df.datetime.apply(lambda x: x.hour)
    df["day"] = df.datetime.apply(lambda x: x.day)
    unique_hours = sorted(df.hour.unique())
    assert [0, 1, 2, 3, 4, 20, 21, 22, 23] == unique_hours
    # Also check that all the dates are still there.
    # `x in series` tests the index labels rather than the values, so test
    # membership against the set of values instead.
    days_present = set(df.day)
    assert 3 in days_present
    assert 2 in days_present
    assert 1 in days_present
Ejemplo n.º 30
0
def test_cdrs_can_be_subset_by_table(subscriber_list_table, get_dataframe,
                                     subscriber_list):
    """
    Passing the subscriber_list_table fixture as subscriber_subset yields
    exactly the subscribers in subscriber_list.
    """

    subset = EventTableSubset(
        "2016-01-01",
        "2016-01-03",
        subscriber_subset=subscriber_list_table,
    )
    frame = get_dataframe(subset)

    # The subscriber column already reflects the msisdn_from/msisdn_to
    # handling, so collecting its distinct values is enough.
    found_subscribers = set(frame.subscriber)
    wanted_subscribers = set(subscriber_list)

    assert found_subscribers == wanted_subscribers