Example #1
0
def test_join_returns_the_same_clusters():
    """
    Joining the clusters to a query covering the same start and stop dates
    must give back the same clusters.

    Both frames are sorted on the same keys and re-indexed so that the
    element-wise comparison lines up row by row.
    """
    sort_keys = ["subscriber", "rank", "calldays"]

    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    clusters = HartiganCluster(calldays=CallDays(locations), radius=50)
    expected = clusters.to_geopandas()

    scores = EventScore(
        start="2016-01-01",
        stop="2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    joined_query = clusters.join_to_cluster_components(scores)
    actual = joined_query.to_geopandas().sort_values(sort_keys)
    actual.reset_index(inplace=True, drop=True)

    # Sorted in place, exactly as before, so the frame object itself is reordered.
    expected.sort_values(sort_keys, inplace=True)
    expected.reset_index(inplace=True, drop=True)

    compared_columns = ["subscriber", "geometry", "rank", "calldays"]
    matches = actual[compared_columns] == expected[compared_columns]
    # First reduce per column, then across columns.
    assert all(matches.all())
Example #2
0
def test_cluster_is_within_envelope(get_dataframe):
    """
    Test that all the clusters are within the envelope formed by all the towers in the cluster.

    Each cluster geometry is checked for intersection against the geometry
    that ``get_geom_point`` builds for the same row from the sites table.
    """
    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    clusters = HartiganCluster(calldays=CallDays(locations), radius=50)
    cluster_frame = clusters.to_geopandas()

    # Index the sites by (site_id, version) before handing them to get_geom_point.
    indexed_sites = Sites().to_geopandas().set_index(["site_id", "version"])

    def tower_geometry(row):
        return get_geom_point(row, indexed_sites)

    tower_geoms = GeoSeries(cluster_frame.apply(tower_geometry, axis=1))
    overlaps = cluster_frame.intersects(tower_geoms)
    assert all(overlaps)
Example #3
0
def test_call_threshold_works(get_dataframe):
    """
    A call threshold above 1 must produce fewer clusters than the default.

    The baseline is first checked to contain at least one cluster with a
    single call day, so that raising the threshold has something to remove.
    """
    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    call_days = CallDays(locations)

    baseline = HartiganCluster(calldays=call_days, radius=50).to_geopandas()
    has_single_day_cluster = any(baseline.calldays == 1)
    assert has_single_day_cluster

    thresholded_query = HartiganCluster(
        calldays=call_days, radius=50, call_threshold=2
    )
    thresholded = get_dataframe(thresholded_query)
    assert len(baseline) > len(thresholded)
Example #4
0
def test_first_call_day_in_first_cluster(get_dataframe):
    """
    Test that the first ranked call day of each subscriber is in the first cluster of each subscriber.

    The first row per subscriber is taken from both the call days and the
    clusters, the two are joined on subscriber, and the call-day site id and
    version must be contained in the cluster's site ids and versions.
    """
    key_columns = ["subscriber", "site_id", "version"]

    locations = SubscriberLocations(
        "2016-01-01",
        "2016-01-04",
        spatial_unit=make_spatial_unit("versioned-site"),
    )
    call_days = CallDays(locations)
    call_days_frame = get_dataframe(call_days)

    cluster_frame = HartiganCluster(calldays=call_days, radius=50).to_geopandas()

    first_call_day = call_days_frame[key_columns].groupby("subscriber").first()
    first_cluster = cluster_frame[key_columns].groupby("subscriber").first()

    def call_day_in_cluster(row):
        # Both the site id and its version must appear in the cluster's members.
        return (row.site_id_cd in row.site_id_har) and (
            row.version_cd in row.version_har
        )

    paired = first_call_day.join(first_cluster, lsuffix="_cd", rsuffix="_har")
    contained = paired.apply(call_day_in_cluster, axis=1)
    assert all(contained)
Example #5
0
class TestHartiganCluster(TestCase):
    """
    Tests for ``HartiganCluster`` built from versioned-site call days.

    ``setUp`` builds a ``CallDays`` query for 2016-01-01 to 2016-01-04 and a
    ``HartiganCluster`` with radius 50 on top of it; the resulting frames are
    shared by the individual tests through ``self``.
    """

    # Columns every cluster frame returned by ``to_geopandas`` is expected to have.
    _EXPECTED_COLUMNS = {
        "subscriber",
        "geometry",
        "rank",
        "calldays",
        "site_id",
        "version",
        "centroid",
    }

    def setUp(self):
        """Build the call days and the default clusters used by the tests."""
        self.cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
        self.cd_df = self.cd.get_dataframe()

        self.hartigan = HartiganCluster(self.cd, 50)
        self.har_df = self.hartigan.to_geopandas()

    def tearDown(self):
        """Invalidate the cache of every stored ``CallDays`` query."""
        # Plain loop: the calls are made for their side effect only.
        for cd in CallDays.get_stored():
            cd.invalidate_db_cache()

    def test_returns_dataframe(self):
        """
        Tests that it returns a dataframe with the correct columns
        """
        self.assertIsInstance(self.har_df, pd.DataFrame)
        self.assertEqual(set(self.har_df.columns), self._EXPECTED_COLUMNS)

    def test_cluster_is_within_envelope(self):
        """
        Test that all the clusters are within the envelope formed by all the towers in the cluster.
        """
        sites = Sites().to_geopandas()
        self.har_df["towers"] = self.har_df.apply(
            lambda x: get_geom_point(x, sites), 1)
        # Compare each cluster against the bounding box of its tower geometry.
        s = self.har_df.apply(
            lambda x: x.geometry.intersects(box(*x.towers.bounds)), axis=1)
        self.assertTrue(all(s))

    def test_first_call_day_in_first_cluster(self):
        """
        Test that the first ranked call day of each subscriber is in the first cluster of each subscriber.
        """
        columns = ["subscriber", "site_id", "version"]
        cd_first = self.cd_df[columns].groupby("subscriber").first()
        har_first = self.har_df[columns].groupby("subscriber").first()

        joined = cd_first.join(har_first, lsuffix="_cd", rsuffix="_har")
        s = joined.apply(
            lambda x: (x.site_id_cd in x.site_id_har) and
            (x.version_cd in x.version_har),
            axis=1,
        )
        self.assertTrue(all(s))

    def test_bigger_radius_yields_fewer_clusters(self):
        """
        Test whether bigger radius yields fewer clusters per subscriber
        """
        radius = [1, 2, 5, 10, 50]

        h = HartiganCluster(self.cd, radius[0]).get_dataframe()
        nclusters_small_radius = h.groupby("subscriber").size()

        # Walk the radii in increasing order, comparing each with the previous one.
        for r in radius[1:]:
            h = HartiganCluster(self.cd, r).get_dataframe()
            nclusters_big_radius = h.groupby("subscriber").size()
            self.assertTrue(
                all(nclusters_small_radius >= nclusters_big_radius))
            nclusters_small_radius = nclusters_big_radius

    def test_different_call_days_format(self):
        """
        Test whether we can pass different call days format such as table name, SQL query and CallDays class.
        """
        cd = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
        har = HartiganCluster(cd, 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

        # Store the query first so that it can be addressed by table name.
        cd.store().result()

        har = HartiganCluster(Table(cd.table_name), 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

        cd_query = cd.get_query()
        har = HartiganCluster(CustomQuery(cd_query), 50).get_dataframe()
        self.assertIsInstance(har, pd.DataFrame)

    def test_call_threshold_works(self):
        """
        Test whether a call threshold above 1 limits the number of clusters.
        """
        # The default clusters must contain something for the threshold to remove.
        self.assertTrue(any(self.har_df.calldays == 1))
        har = HartiganCluster(self.cd, 50, call_threshold=2).get_dataframe()
        # Compare against the thresholded result itself; previously it was
        # computed but never inspected.
        self.assertGreater(len(self.har_df), len(har))

    def test_buffered_hartigan(self):
        """
        Test whether Hartigan produces buffered clusters when buffer is larger than 0.
        """
        har = HartiganCluster(self.cd, 50, buffer=2).to_geopandas()
        areas = har.geometry.area
        # since the mock data does not have geom_area in the site table we either
        # get the clusters with area equivalent to 2km2 (areas below are in degrees) or None.
        min_area = areas.min()
        max_area = areas.max()
        self.assertAlmostEqual(min_area, 0.001, 3)
        self.assertAlmostEqual(max_area, 0.001, 3)

    def test_all_options_hartigan(self):
        """
        Test whether Hartigan works when changing all options.
        """
        har = HartiganCluster(self.cd, 50, buffer=2,
                              call_threshold=2).to_geopandas()
        self.assertIsInstance(har, pd.DataFrame)
        self.assertEqual(set(har.columns), self._EXPECTED_COLUMNS)

    def test_join_returns_the_same_clusters(self):
        """
        Test whether joining to another table for which the start and stop time are the same yields the same clusters.
        """
        es = EventScore(start="2016-01-01",
                        stop="2016-01-04",
                        level="versioned-site")

        sort_keys = ["subscriber", "rank", "calldays"]
        joined = (self.hartigan.join_to_cluster_components(
            es).to_geopandas().sort_values(sort_keys))
        joined.reset_index(inplace=True, drop=True)

        # Sort and re-index both frames identically so rows line up for ``==``.
        self.har_df.sort_values(sort_keys, inplace=True)
        self.har_df.reset_index(inplace=True, drop=True)

        cols = ["subscriber", "geometry", "rank", "calldays"]
        compare = joined[cols] == self.har_df[cols]
        self.assertTrue(all(compare.all()))