Ejemplo n.º 1
0
def test_table_parent():
    """
    A table restricted to a subset of its columns should depend on the
    unrestricted parent table.
    """
    restricted = Table("events.calls", columns=["id"])
    parent = Table("events.calls")
    assert restricted.dependencies.pop().md5 == parent.md5
Ejemplo n.º 2
0
def test_subsetnumeric_subsetnumeric(get_dataframe):
    """
    Test that two numeric subsets applied one after the other give the same
    result as filtering the dataframe directly with both range conditions.
    """

    sub_cola = "shape_star"
    sub_lowa = 0.1
    sub_higha = 0.2
    sub_colb = "shape_leng"
    sub_lowb = 1.0
    sub_highb = 2.0
    t = Table("geography.admin3")
    t_df = get_dataframe(t)

    # Bug fix: the first subset's upper bound was `sub_lowb` (a bound for the
    # *second* column), which left `sub_higha` unused and made the query
    # disagree with the pandas filter below.
    sub_q = t.numeric_subset(sub_cola, sub_lowa,
                             sub_higha).numeric_subset(sub_colb, sub_lowb,
                                                       sub_highb)
    sub_df = t_df[(sub_lowa <= t_df[sub_cola])
                  & (t_df[sub_cola] <= sub_higha)
                  & (sub_lowb <= t_df[sub_colb])
                  & (t_df[sub_colb] <= sub_highb)]
    sub_df = sub_df.reset_index(drop=True)

    assert get_dataframe(sub_q).equals(sub_df)
Ejemplo n.º 3
0
def test_children():
    """
    Test that table inheritance is correctly detected.
    """
    partitioned_table = Table("events.calls")
    plain_table = Table("geography.admin3")
    assert partitioned_table.has_children()
    assert not plain_table.has_children()
Ejemplo n.º 4
0
def test_subset_subsetnumeric(get_dataframe):
    """
    Apply a non-numeric subset and a numeric subset one after another, in
    both possible orders, and check each matches a direct pandas filter.
    """

    name_col = "admin1name"
    name_val = "Central Development Region"
    area_col = "shape_area"
    area_low = 0.1
    area_high = 0.12
    table = Table("geography.admin3")
    table_df = get_dataframe(table)

    # Same pair of subsets, applied in the two possible orders.
    subset_then_numeric = table.subset(name_col, name_val).numeric_subset(
        area_col, area_low, area_high)
    numeric_then_subset = table.numeric_subset(
        area_col, area_low, area_high).subset(name_col, name_val)

    mask = ((table_df[name_col] == name_val)
            & (area_low <= table_df[area_col])
            & (table_df[area_col] <= area_high))
    expected = table_df[mask].reset_index(drop=True)

    assert get_dataframe(subset_then_numeric).equals(expected)
    assert get_dataframe(numeric_then_subset).equals(expected)
Ejemplo n.º 5
0
def test_join_column_names(join_type):
    """
    Test that the column_names attribute of a self-join is correct.

    A self-join on ``msisdn`` yields the join key once, followed by the
    remaining columns from each side of the join.
    """
    t = Table("events.calls_20160101")
    joined = t.join(t, on_left="msisdn", how=join_type)
    cols = t.column_names
    cols.remove("msisdn")
    # The non-key columns appear twice, once per side of the self-join.
    # (The original `[f"{c}" for c in cols]` was a no-op f-string wrapper.)
    expected = ["msisdn"] + cols + cols
    assert expected == joined.column_names
Ejemplo n.º 6
0
def test_pickling():
    """
    Test that we can pickle and unpickle subset classes.
    """
    ids = ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    subset = Table("events.calls").subset("id", ids)
    roundtripped = pickle.loads(pickle.dumps(subset))
    # The round-tripped object must produce the same SQL and identity.
    assert subset.get_query() == roundtripped.get_query()
    assert subset.query_id == roundtripped.query_id
Ejemplo n.º 7
0
def test_union(get_dataframe):
    """
    Test union with all set to false dedupes.
    """
    calls = Table(schema="events", name="calls")
    deduped_df = get_dataframe(calls.union(calls, all=False))
    matching_rows = deduped_df[deduped_df.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    assert len(matching_rows) == 2
Ejemplo n.º 8
0
def test_union_all(get_dataframe):
    """
    Test default union behaviour keeps duplicates.
    """
    calls = Table(schema="events", name="calls")
    union_df = get_dataframe(calls.union(calls))
    matching_rows = union_df[union_df.id == "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    assert len(matching_rows) == 4
Ejemplo n.º 9
0
def test_store_with_table():
    """
    Test that a subset of a table can be stored.
    """
    calls = Table("events.calls")
    subset = calls.subset(
        "id", ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"])
    subset.store().result()
    assert subset.is_stored
    # Invalidating the parent's cache should drop the stored subset, while
    # the underlying table itself remains stored.
    calls.invalidate_db_cache()
    assert not subset.is_stored
    assert calls.is_stored
Ejemplo n.º 10
0
def test_dependencies():
    """
    Check that a table without explicit columns has no other queries as a dependency,
    and a table with explicit columns has its parent table as a dependency.
    """
    full_table = Table("events.calls")
    assert full_table.dependencies == set()

    column_subset = Table("events.calls", columns=["id"])
    assert len(column_subset.dependencies) == 1
    parent = column_subset.dependencies.pop()
    assert parent.query_id == "057addedac04dbeb1dcbbb6b524b43f0"
Ejemplo n.º 11
0
def test_can_force_rewrite(flowmachine_connect, get_length):
    """
    Test that we can force the rewrite of a test to the database.
    """
    subset_query = EventTableSubset("2016-01-01", "2016-01-01 01:00:00")
    subset_query.to_sql(schema="tests", name="test_rewrite")
    # Empty the stored table, then force a rewrite and check that the
    # table has data again.
    flowmachine_connect.engine.execute("""DELETE FROM tests.test_rewrite""")
    assert get_length(Table("tests.test_rewrite")) == 0
    subset_query.to_sql(schema="tests", name="test_rewrite", force=True)
    assert get_length(Table("tests.test_rewrite")) > 1
Ejemplo n.º 12
0
def test_join_column_names(join_type):
    """Test that join column_names attribute is correct"""
    calls = Table("events.calls_20160101", columns=["location_id", "datetime"])
    cells = Table("infrastructure.cells", columns=["id", "geom_point"])
    joined = calls.join(cells, on_left="location_id", on_right="id", how=join_type)

    # Left-flavoured joins keep the left key name; other joins keep the right's.
    if join_type in ("left", "inner", "left outer"):
        key_column = "location_id"
    else:
        key_column = "id"
    assert joined.column_names == [key_column, "datetime", "geom_point"]
Ejemplo n.º 13
0
def subscriber_list_table(subscriber_list, flowmachine_connect):
    """
    Fixture which creates a ``subscriber_list`` table in the database
    populated with the given subscribers, yields it as a Table, and drops
    it (and its cache record) afterwards.
    """
    engine = flowmachine_connect.engine
    # Bug fix: `with engine.begin():` opened a transaction but discarded its
    # connection, so the statements below ran on the engine *outside* the
    # transaction. Bind the connection and execute on it instead.
    with engine.begin() as connection:
        sql = """CREATE TABLE subscriber_list (subscriber TEXT)"""
        connection.execute(sql)

        formatted_subscribers = ",".join("('{}')".format(u)
                                         for u in subscriber_list)
        sql = """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
            formatted_subscribers)
        connection.execute(sql)
    subs_table = Table("subscriber_list")
    yield subs_table
    subs_table.invalidate_db_cache(drop=True)
Ejemplo n.º 14
0
def test_table_init():
    """
    Test that table creation handles params properly.

    A valid table constructs without error; bad columns arguments and
    nonexistent schemas/tables raise ValueError.
    """

    Table("events.calls")  # valid table: must not raise (result unused)
    with pytest.raises(ValueError):
        Table("events.calls", "moose")
    with pytest.raises(ValueError):
        Table("events.calls", columns="NO SUCH COLUMN")
    with pytest.raises(ValueError):
        Table("NOSUCHTABLE")
    with pytest.raises(ValueError):
        Table("events.WHAAAAAAAAT")
Ejemplo n.º 15
0
def test_can_force_rewrite(flowmachine_connect, get_length):
    """
    Test that we can force the rewrite of a test to the database.
    """
    subset_query = EventTableSubset(start="2016-01-01", stop="2016-01-01 01:00:00")
    subset_query.to_sql(name="test_rewrite", schema="tests").result()
    # Empty the stored table, then invalidate the cache record and write the
    # query out again; the table should be repopulated.
    get_db().engine.execute("""DELETE FROM tests.test_rewrite""")
    assert get_length(Table("tests.test_rewrite")) == 0
    subset_query.invalidate_db_cache(name="test_rewrite", schema="tests")
    subset_query.to_sql(name="test_rewrite", schema="tests").result()
    assert get_length(Table("tests.test_rewrite")) > 1
def test_subscribers_who_make_atleast_3_calls_in_central_development_region():
    """
    Test that we can find subsets for multiple geometries at the same time.
    Finds subscribers who have made at least 2 calls in any of the admin2
    regions within the Central Development admin1 region.

    NOTE(review): the function name says "3 calls" but ``min_calls`` below
    is 2 — the name looks stale; confirm which threshold was intended.
    """
    start, stop = "2016-01-01", "2016-01-07"
    central_regions = Table("admin2", "geography").subset(
        "admin1name", ["Central Development Region"]
    )

    location_subset = SubscriberLocationSubset(
        start,
        stop,
        min_calls=2,
        level="polygon",
        column_name="admin2pcod",
        polygon_table=central_regions,
    )

    df = location_subset.get_dataframe()

    # We have results for multiple regions
    assert len(df.admin2pcod.unique()) > 1

    # Some users should have made at least 2 calls in more than one region
    # and should therefore appear twice
    assert len(df[df.duplicated("subscriber")]) > 0
Ejemplo n.º 17
0
    def test_cdrs_can_be_subset_by_table(self):
        """
        We can subset CDRs by a table in the database.
        """

        # Create a temporary table in the DB
        con = Table.connection.engine

        sql = "DROP TABLE IF EXISTS subscriber_list"
        con.execute(sql)

        sql = """CREATE TABLE subscriber_list (subscriber TEXT)"""
        con.execute(sql)

        formatted_subscribers = ",".join("('{}')".format(u)
                                         for u in self.subscriber_list)
        sql = """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
            formatted_subscribers)
        con.execute(sql)
        # Robustness fix: drop the temporary table even if the subset query
        # fails, so a failure here cannot leak state into later tests.
        try:
            su = EventTableSubset("2016-01-01",
                                  "2016-01-03",
                                  subscriber_subset=Table("subscriber_list"))

            df = su.get_dataframe()
        finally:
            sql = "DROP TABLE IF EXISTS subscriber_list"
            con.execute(sql)
        # Get the set of subscribers present in the dataframe, we need to handle the logic
        # of msisdn_from/msisdn_to
        calculated_subscriber_set = set(df.subscriber)

        self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
Ejemplo n.º 18
0
def test_subset():
    """
    Test that a subset of a table doesn't show as stored.
    """
    ids = ["5wNJA-PdRJ4-jxEdG-yOXpZ", "5wNJA-PdRJ4-jxEdG-yOXpZ"]
    subset = Table("events.calls").subset("id", ids)
    assert not subset.is_stored
Ejemplo n.º 19
0
def test_touch_cache_record_for_table(flowmachine_connect):
    """
    Touching a cache record for a table should update access count and last accessed but not touch score, or counter.
    """
    table = Table("events.calls_20160101")

    def fetch_cached(column):
        # Read a single column of this table's row in cache.cached.
        return flowmachine_connect.fetch(
            f"SELECT {column} FROM cache.cached WHERE query_id='{table.query_id}'"
        )[0][0]

    flowmachine_connect.engine.execute(
        "UPDATE cache.cached SET compute_time = 1 WHERE query_id=%s",
        table.query_id
    )  # Compute time for tables is zero, so set to 1 to avoid zeroing out
    assert get_score(flowmachine_connect, table.query_id) == 0
    assert fetch_cached("access_count") == 1
    accessed_at = fetch_cached("last_accessed")
    touch_cache(flowmachine_connect, table.query_id)
    assert get_score(flowmachine_connect, table.query_id) == 0
    assert fetch_cached("access_count") == 2
    # No cache touch should be recorded
    assert flowmachine_connect.fetch(
        "SELECT nextval('cache.cache_touches');")[0][0] == 2
    assert accessed_at < fetch_cached("last_accessed")
Ejemplo n.º 20
0
    def __init__(self, table="sites", date=None):
        """
        Parameters
        ----------
        table : str, default 'sites'
            Which table to collect versioned information from.
            Only the tables infrastructure.sites and infrastructure.cells
            are supported.
        date : str, default None
            The date to collect a valid version from. This date
            must be formatted using ISO standards (2016-01-13).
            If no date is passed the current date will be used.

        Raises
        ------
        ValueError
            If `table` is not one of 'sites' or 'cells'.
        """
        if table not in ("sites", "cells"):
            # Bug fix: the original message read "...sites and and
            # infrastructure.cells..." due to a duplicated "and".
            raise ValueError("Only the tables infrastructure.sites and " +
                             "infrastructure.cells are supported.")

        if date is None:  # `is None`, not `== None` (PEP 8)
            date = datetime.now().strftime("%Y-%m-%d")

        self.table = Table(schema="infrastructure", name=table)
        self.date = date

        super().__init__()
Ejemplo n.º 21
0
def get_sql_for_query_id(query_id):
    """
    Return the SQL which, when run against flowdb, will
    return the result for the query with the given id.

    Parameters
    ----------
    query_id : str
        The query id

    Returns
    -------
    str
    """
    # Cached query results live in the `cache` schema under an x-prefixed name.
    cache_table = Table(f"x{query_id}", "cache")
    return cache_table.get_query()
Ejemplo n.º 22
0
def test_subset_subset(get_dataframe):
    """
    Apply two non-numeric subsets one after the other and compare the result
    with filtering the dataframe directly on both conditions.
    """

    admin1_col, admin1_val = "admin1name", "Central Development Region"
    admin2_col, admin2_val = "admin2name", "Bagmati"
    table = Table("geography.admin3")
    table_df = get_dataframe(table)

    chained = table.subset(admin1_col, admin1_val).subset(admin2_col, admin2_val)
    expected = table_df[
        (table_df[admin1_col] == admin1_val)
        & (table_df[admin2_col] == admin2_val)
    ].reset_index(drop=True)

    assert get_dataframe(chained).equals(expected)
def test_raises_valueerror_when_grouping_element_not_provided():
    """
    RasterStatistics() raises ValueError when `grouping_element` not provided.
    """
    vector = Table(schema="geography", name="admin2")
    with pytest.raises(ValueError):
        # Unused locals `G = None` and the `r =` binding removed; the
        # constructor call itself is what must raise.
        RasterStatistics(
            "population.small_nepal_raster", vector=vector, grouping_element=None
        )
Ejemplo n.º 24
0
 def test_raster_statistics_column_names_vector(self):
     """
     Test that column_names property matches head(0) for RasterStatistics
     when vector is not None
     """
     admin_vector = Table(schema="public", name="gambia_admin2")
     stats = RasterStatistics(
         raster="worldpop_gambia",
         vector=admin_vector,
         grouping_element="district_c",
     )
     assert stats.head(0).columns.tolist() == stats.column_names
def test_raster_statistics_column_names_vector(get_dataframe):
    """
    Test that column_names property matches head(0) for RasterStatistics
    when vector is not None
    """
    admin_vector = Table(schema="geography", name="admin2")
    stats = RasterStatistics(
        raster="population.small_nepal_raster",
        vector=admin_vector,
        grouping_element="admin2pcod",
    )
    assert get_dataframe(stats).columns.tolist() == stats.column_names
def test_raises_notimplemented_when_wrong_statistic_requested():
    """
    RasterStatistics() raises NotImplementedError if wrong statistic requested.
    """
    vector = Table(schema="geography", name="admin2")
    with pytest.raises(NotImplementedError):
        # Unused local `r =` removed and the single-use `G` inlined; the
        # constructor call itself is what must raise.
        RasterStatistics(
            raster="population.small_nepal_raster",
            vector=vector,
            grouping_element="admin2pcod",
            statistic="foobar",
        )
def test_computes_expected_clipping_values(get_dataframe):
    """
    RasterStatistics() returns correct values when clipping vector and raster layers.
    """
    grouping = "admin2pcod"
    admin_vector = Table(schema="geography", name="admin2")
    stats = RasterStatistics(
        raster="population.small_nepal_raster",
        vector=admin_vector,
        grouping_element=grouping,
    )

    result = get_dataframe(stats)
    # Should have only _one_ entry
    assert len(result) == 1
    assert "524 3 07" in result.admin2pcod.values
    assert result.set_index("admin2pcod").loc["524 3 07"][0] == 2500000.0
Ejemplo n.º 28
0
    def test_query_can_be_subscriber_set_restricted(self):
        """Test that some queries can be limited to only a subset of subscribers."""

        # Create a temporary table in the DB
        con = Table.connection.engine

        sql = "DROP TABLE IF EXISTS subscriber_list"
        con.execute(sql)

        sql = """CREATE TABLE subscriber_list (subscriber TEXT)"""
        con.execute(sql)

        formatted_subscribers = ",".join("('{}')".format(u)
                                         for u in self.subscriber_list)
        sql = """INSERT INTO subscriber_list (subscriber) VALUES {}""".format(
            formatted_subscribers)
        con.execute(sql)
        # Robustness fix: drop the temporary table even if the queries fail,
        # so a failure here cannot leak state into later tests.
        try:
            rog = RadiusOfGyration("2016-01-01",
                                   "2016-01-03",
                                   subscriber_subset=Table("subscriber_list"))
            hl = HomeLocation(*[
                daily_location(d, subscriber_subset=Table("subscriber_list"))
                for d in list_of_dates("2016-01-01", "2016-01-03")
            ])
            rog_df = rog.get_dataframe()
            hl_df = hl.get_dataframe()
        finally:
            sql = "DROP TABLE IF EXISTS subscriber_list"
            con.execute(sql)

        # Get the set of subscribers present in the dataframe, we need to handle the logic
        # of msisdn_from/msisdn_to
        calculated_subscriber_set = set(rog_df.subscriber)

        self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
        calculated_subscriber_set = set(hl_df.subscriber)

        self.assertEqual(calculated_subscriber_set, set(self.subscriber_list))
Ejemplo n.º 29
0
    def test_different_call_days_format(self):
        """
        Test whether we can pass different call days format such as table name, SQL query and CallDays class.
        """
        call_days = CallDays("2016-01-01", "2016-01-04", level="versioned-site")
        # Directly from the CallDays query object.
        self.assertIsInstance(
            HartiganCluster(call_days, 50).get_dataframe(), pd.DataFrame)

        call_days.store().result()

        # From the stored table backing the query.
        self.assertIsInstance(
            HartiganCluster(Table(call_days.table_name), 50).get_dataframe(),
            pd.DataFrame)

        # From the raw SQL of the query.
        self.assertIsInstance(
            HartiganCluster(CustomQuery(call_days.get_query()),
                            50).get_dataframe(),
            pd.DataFrame)
Ejemplo n.º 30
0
    def test_computes_expected_clipping_values(self):
        """
        RasterStatistics() returns correct values when clipping vector and raster layers.
        """
        grouping = "district_c"
        admin_vector = Table(schema="public", name="gambia_admin2")
        stats = RasterStatistics(raster="worldpop_gambia",
                                 vector=admin_vector,
                                 grouping_element=grouping)

        result = stats.get_dataframe()
        # Compare each district's computed statistic against the reference
        # values held in the rasterio_results fixture.
        for expected in self.rasterio_results.to_dict("records"):
            district_rows = result[result[grouping] == expected["district"]]
            self.assertAlmostEqual(int(district_rows.statistic.iloc[0]),
                                   expected["value"])