def __init__(
    self,
    start=None,
    stop=None,
    *,
    hours="all",
    table="events.calls",
    subscriber_subset=None,
    columns=["*"],
    subscriber_identifier="msisdn",
):
    """
    Configure a subset of an events table restricted by date range,
    columns and (optionally) a subscriber subset.

    Parameters
    ----------
    start, stop : str or datetime.date, optional
        Bounds of the time window. ``datetime.date`` instances are
        converted to ``"%Y-%m-%d"`` strings before being stored.
    hours : default "all"
        Stored unchanged on ``self.hours``.
    table : str, default "events.calls"
        Name handed to ``Table`` to resolve the underlying events table.
    subscriber_subset : optional
        Passed to ``make_subscriber_subsetter``; the original value is
        kept on ``self.subscriber_subset_ORIG``.
    columns : list of str, default ["*"]
        Columns to select. ``["*"]`` selects every column reported by the
        table's ``column_names``.
    subscriber_identifier : str, default "msisdn"
        Column identifying a subscriber. It is stored lower-cased, and the
        matching entry of ``columns`` is aliased to ``subscriber``.

    Raises
    ------
    ValueError
        If ``start`` and ``stop`` compare equal. This includes the case
        where both are left as ``None``.
    """
    # Temporary band-aid; marshmallow deserialises date strings
    # to date objects, so we convert it back here because the
    # lower-level classes still assume we are passing date strings.
    # NOTE(review): datetime.datetime is a subclass of datetime.date, so a
    # datetime passed here is also reduced to "%Y-%m-%d" and loses its
    # time-of-day component - confirm that this is intended.
    if isinstance(start, datetime.date):
        start = start.strftime("%Y-%m-%d")
    if isinstance(stop, datetime.date):
        stop = stop.strftime("%Y-%m-%d")

    self.start = start
    self.stop = stop
    self.hours = hours
    self.subscriber_subset_ORIG = subscriber_subset
    self.subscriber_subsetter = make_subscriber_subsetter(subscriber_subset)
    self.subscriber_identifier = subscriber_identifier.lower()

    # The list default is only ever compared and rebound below, never
    # mutated, so sharing it between calls is harmless.
    if columns == ["*"]:
        self.table_ORIG = Table(table)
        columns = self.table_ORIG.column_names
    else:
        self.table_ORIG = Table(table, columns=columns)
    self.columns = set(columns)

    # Alias the subscriber identifier column to "subscriber". The spelling
    # given by the caller is tried first (so previously-working calls are
    # unchanged); the lower-cased spelling stored on
    # self.subscriber_identifier is tried second, so that e.g. "MSISDN"
    # still matches a column named "msisdn" instead of silently dropping
    # the subscriber column.
    for candidate in (subscriber_identifier, self.subscriber_identifier):
        if candidate in self.columns:
            self.columns.remove(candidate)
            self.columns.add(f"{candidate} AS subscriber")
            break
    else:
        if self.subscriber_subsetter.is_proper_subset:
            warnings.warn(
                f"No subscriber column requested, did you mean to include {subscriber_identifier} in columns? "
                "Since you passed a subscriber_subset the data will still be subset by your subscriber subset, "
                "but the subscriber column will not be present in the output.",
                stacklevel=2,
            )
    # Sorted so that the column order does not depend on set iteration order.
    self.columns = sorted(self.columns)

    self.sqlalchemy_table = get_sqlalchemy_table_definition(
        self.table_ORIG.fully_qualified_table_name,
        engine=Query.connection.engine,
    )

    if self.start == self.stop:
        raise ValueError("Start and stop are the same.")

    super().__init__()

    # This needs to happen after the parent classes init method has been
    # called as it relies upon the connection object existing
    self._check_dates()
def test_print_dependency_tree():
    """
    Test that the expected dependency tree is printed for a daily location
    query (with an explicit subset).

    The query is built with a CustomQuery-based subscriber subsetter, its
    dependency tree is written to an in-memory stream, and the output is
    compared with a fixed expected tree after masking the query ids.
    """
    subscriber_subsetter = make_subscriber_subsetter(
        CustomQuery(
            "SELECT duration, msisdn as subscriber FROM events.calls WHERE duration < 10",
            ["duration", "subscriber"],
        )
    )
    q = daily_location(
        date="2016-01-02", method="most-common", subscriber_subset=subscriber_subsetter
    )
    # The comparison below is whitespace-sensitive: the indentation of every
    # line must match what print_dependency_tree emits for that nesting depth.
    # NOTE(review): confirm the nesting/indentation of this literal against
    # the real output of print_dependency_tree.
    expected_output = textwrap.dedent(
        """\
        <Query of type: MostFrequentLocation, query_id: 'xxxxx'>
          - <Query of type: PolygonSpatialUnit, query_id: 'xxxxx'>
             - <Table: 'geography.admin3', query_id: 'xxxxx'>
          - <Query of type: SubscriberLocations, query_id: 'xxxxx'>
             - <Query of type: JoinToLocation, query_id: 'xxxxx'>
                - <Query of type: PolygonSpatialUnit, query_id: 'xxxxx'>
                   - <Table: 'geography.admin3', query_id: 'xxxxx'>
                - <Query of type: EventsTablesUnion, query_id: 'xxxxx'>
                   - <Query of type: EventTableSubset, query_id: 'xxxxx'>
                      - <Query of type: CustomQuery, query_id: 'xxxxx'>
                      - <Table: 'events.sms', query_id: 'xxxxx'>
                         - <Table: 'events.sms', query_id: 'xxxxx'>
                   - <Query of type: EventTableSubset, query_id: 'xxxxx'>
                      - <Query of type: CustomQuery, query_id: 'xxxxx'>
                      - <Table: 'events.calls', query_id: 'xxxxx'>
                         - <Table: 'events.calls', query_id: 'xxxxx'>
             - <Query of type: PolygonSpatialUnit, query_id: 'xxxxx'>
                - <Table: 'geography.admin3', query_id: 'xxxxx'>
        """
    )
    # Capture the printed tree in memory rather than on stdout.
    s = StringIO()
    print_dependency_tree(q, stream=s)
    output = s.getvalue()
    # Replace every standalone run of lowercase hex characters with a fixed
    # placeholder so that the actual query ids do not affect the comparison.
    output_with_query_ids_replaced = re.sub(r"\b[0-9a-f]+\b", "xxxxx", output)
    assert expected_output == output_with_query_ids_replaced
def __init__(
    self,
    *,
    start=None,
    stop=None,
    hours="all",
    hour_slices=None,
    table="events.calls",
    subscriber_subset=None,
    columns=["*"],
    subscriber_identifier="msisdn",
):
    """
    Configure a subset of an events table restricted by date range, hours
    of the day, columns and (optionally) a subscriber subset.

    Parameters
    ----------
    start, stop : optional
        Bounds of the time window; each is passed through
        ``standardise_date`` before being stored.
    hours : "all" or tuple of (int, int), default "all"
        ``(start_hour, stop_hour)``. When ``start_hour <= stop_hour`` a
        single daily interval is built; otherwise the range is treated as
        spanning midnight and split into two daily intervals. ``"all"``
        yields an ``HourSlice`` with no intervals.
    hour_slices : optional
        Mutually exclusive with ``hours``.
    table : str, default "events.calls"
        Name handed to ``Table`` to resolve the underlying events table.
    subscriber_subset : optional
        Passed to ``make_subscriber_subsetter``.
    columns : list of str, default ["*"]
        Columns to select. ``["*"]`` selects every column reported by the
        table's ``column_names``.
    subscriber_identifier : str, default "msisdn"
        Column identifying a subscriber. It is stored lower-cased, and the
        matching entry of ``columns`` is aliased to ``subscriber``.

    Raises
    ------
    ValueError
        If both ``hours`` and ``hour_slices`` are given, or if the
        standardised ``start`` and ``stop`` compare equal.
    AssertionError
        If ``hours`` is neither ``"all"`` nor a tuple of two ints.
    """
    if hours != "all" and hour_slices is not None:
        raise ValueError(
            "The arguments `hours` and `hour_slices` are mutually exclusive."
        )

    # NOTE(review): `hour_slices` is only consulted in the check above; its
    # value is never stored or applied, so passing it alone currently
    # behaves like hours="all". Confirm whether it should be assigned to
    # self.hour_slices (and what type callers are expected to pass).
    if hours != "all":
        assert (
            isinstance(hours, tuple)
            and len(hours) == 2
            and isinstance(hours[0], int)
            and isinstance(hours[1], int)
        ), f"`hours` must be 'all' or a tuple of two ints, got {hours!r}"  # sanity check

        start_hour = hours[0]
        stop_hour = hours[1]
        start_hour_str = f"{start_hour:02d}:00"
        stop_hour_str = f"{stop_hour:02d}:00"
        if start_hour <= stop_hour:
            hs = HourInterval(
                start_hour=start_hour_str, stop_hour=stop_hour_str, freq="day"
            )
            self.hour_slices = HourSlice(hour_intervals=[hs])
        else:
            # If hours are backwards, then this is interpreted as spanning midnight,
            # so we split it into two time slices for the beginning/end of the day.
            hs1 = HourInterval(start_hour=None, stop_hour=stop_hour_str, freq="day")
            hs2 = HourInterval(start_hour=start_hour_str, stop_hour=None, freq="day")
            self.hour_slices = HourSlice(hour_intervals=[hs1, hs2])
    else:
        self.hour_slices = HourSlice(hour_intervals=[])

    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.hours = hours
    self.subscriber_subsetter = make_subscriber_subsetter(subscriber_subset)
    self.subscriber_identifier = subscriber_identifier.lower()

    # The list default is only ever compared and rebound below, never
    # mutated, so sharing it between calls is harmless.
    if columns == ["*"]:
        self.table_ORIG = Table(table)
        columns = self.table_ORIG.column_names
    else:
        self.table_ORIG = Table(table, columns=columns)
    self.columns = set(columns)

    # Alias the subscriber identifier column to "subscriber". The spelling
    # given by the caller is tried first (so previously-working calls are
    # unchanged); the lower-cased spelling stored on
    # self.subscriber_identifier is tried second, so that e.g. "MSISDN"
    # still matches a column named "msisdn" instead of silently dropping
    # the subscriber column.
    for candidate in (subscriber_identifier, self.subscriber_identifier):
        if candidate in self.columns:
            self.columns.remove(candidate)
            self.columns.add(f"{candidate} AS subscriber")
            break
    else:
        if self.subscriber_subsetter.is_proper_subset:
            warnings.warn(
                f"No subscriber column requested, did you mean to include {subscriber_identifier} in columns? "
                "Since you passed a subscriber_subset the data will still be subset by your subscriber subset, "
                "but the subscriber column will not be present in the output.",
                stacklevel=2,
            )
    # Sorted so that the column order does not depend on set iteration order.
    self.columns = sorted(self.columns)

    self.sqlalchemy_table = get_sqlalchemy_table_definition(
        self.table_ORIG.fully_qualified_table_name,
        engine=get_db().engine,
    )

    # NOTE(review): if standardise_date maps None to None, leaving both
    # start and stop unset also trips this check - confirm.
    if self.start == self.stop:
        raise ValueError("Start and stop are the same.")

    super().__init__()

    # This needs to happen after the parent classes init method has been
    # called as it relies upon the connection object existing
    self._check_dates()