def __init__(
    self,
    start,
    stop,
    hours=(20, 4),
    *,
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.BOTH,
    subscriber_subset=None,
    tables="all",
):
    """
    Record the query parameters and build the events union to aggregate.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    hours : tuple, default (20, 4)
        Stored on the instance only. The union below is deliberately built
        with ``hours="all"`` so that every event in the period is fetched.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    subscriber_subset : optional
        Forwarded unchanged to ``EventsTablesUnion``.
    tables : str or list of str, default 'all'
        Forwarded unchanged to ``EventsTablesUnion``.
    """
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.subscriber_identifier = subscriber_identifier
    self.direction = Direction(direction)
    self.hours = hours
    self.tables = tables

    wanted_columns = [self.subscriber_identifier, "datetime"]
    wanted_columns.extend(self.direction.required_columns)

    union_options = dict(
        tables=self.tables,
        columns=wanted_columns,
        hours="all",
        subscriber_identifier=subscriber_identifier,
        subscriber_subset=subscriber_subset,
    )
    self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
    super().__init__()
def __init__(
    self,
    start,
    stop,
    statistic="avg",
    *,
    hours="all",
    tables="all",
    direction: Union[str, Direction] = Direction.BOTH,
    subscriber_subset=None,
    exclude_self_calls=True,
):
    """
    Validate the arguments and build the two event unions that get joined.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    statistic : str, default 'avg'
        Aggregate name; lower-cased and checked against ``valid_stats``.
    hours, tables, subscriber_subset
        Forwarded unchanged to both ``EventsTablesUnion`` instances.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``.
    exclude_self_calls : bool, default True
        Stored on the instance for use when the query is generated.

    Raises
    ------
    ValueError
        If ``statistic`` (lower-cased) is not in ``valid_stats``.
    """
    self.tables = tables
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.hours = hours
    self.direction = Direction(direction)
    self.exclude_self_calls = exclude_self_calls
    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        raise ValueError(
            "{} is not a valid statistic. Use one of {}".format(
                self.statistic, valid_stats
            )
        )

    column_list = ["msisdn", "msisdn_counterpart", "id", "location_id", "outgoing"]

    # EventsTablesUnion will only subset on the subscriber identifier,
    # which means that we need to query for a unioned table twice. That has
    # a considerable negative impact on execution time.
    self.unioned_from_query = EventsTablesUnion(
        self.start,
        self.stop,
        columns=column_list,
        tables=self.tables,
        subscriber_identifier="msisdn",
        hours=hours,
        subscriber_subset=subscriber_subset,
    )
    self.unioned_to_query = EventsTablesUnion(
        self.start,
        self.stop,
        columns=column_list,
        tables=self.tables,
        subscriber_identifier="msisdn_counterpart",
        hours=hours,
        subscriber_subset=subscriber_subset,
    )
    self.distance_matrix = DistanceMatrix()
    super().__init__()
def __init__(
    self,
    start,
    stop,
    *,
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.OUT,
    statistic="sum",
    spatial_unit: Optional[AnySpatialUnit] = None,
    hours="all",
    subscriber_subset=None,
):
    """
    Validate the arguments and build the location-joined calls query.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.OUT
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    statistic : str, default 'sum'
        Aggregate name; lower-cased and checked against ``valid_stats``.
    spatial_unit : optional
        When None, ``make_spatial_unit("admin", level=3)`` is used.
    hours, subscriber_subset
        Forwarded unchanged to ``EventsTablesUnion``.

    Raises
    ------
    ValueError
        If ``statistic`` (lower-cased) is not in ``valid_stats``.
    """
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.subscriber_identifier = subscriber_identifier
    self.direction = Direction(direction)
    self.spatial_unit = (
        make_spatial_unit("admin", level=3) if spatial_unit is None else spatial_unit
    )

    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        message = "{} is not a valid statistic. Use one of {}".format(
            self.statistic, valid_stats
        )
        raise ValueError(message)

    wanted_columns = [
        self.subscriber_identifier,
        "msisdn_counterpart",
        "duration",
        "location_id",
        "datetime",
    ]
    wanted_columns.extend(self.direction.required_columns)

    # Only the calls table is queried here.
    calls_union = EventsTablesUnion(
        self.start,
        self.stop,
        tables="events.calls",
        columns=wanted_columns,
        hours=hours,
        subscriber_subset=subscriber_subset,
        subscriber_identifier=self.subscriber_identifier,
    )
    self.unioned_query = location_joined_query(
        calls_union,
        spatial_unit=self.spatial_unit,
        time_col="datetime",
    )
    super().__init__()
def __init__(
    self,
    start,
    stop,
    statistic="avg",
    *,
    spatial_unit: AnySpatialUnit = make_spatial_unit("cell"),
    hours="all",
    tables="all",
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.BOTH,
    subscriber_subset=None,
):
    """
    Validate the arguments and build the location-joined events query.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    statistic : str, default 'avg'
        Aggregate name. It is lower-cased before being checked against
        ``valid_stats``, so the check is case-insensitive.
    spatial_unit : default ``make_spatial_unit("cell")``
        Spatial unit handed to ``location_joined_query``.
    hours, tables, subscriber_subset
        Forwarded unchanged to ``EventsTablesUnion``.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.

    Raises
    ------
    ValueError
        If ``statistic`` (lower-cased) is not in ``valid_stats``.
    """
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.spatial_unit = spatial_unit
    self.hours = hours
    self.tables = tables
    self.subscriber_identifier = subscriber_identifier
    self.direction = Direction(direction)
    # Normalise case so that e.g. "AVG" is accepted rather than rejected.
    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        raise ValueError(
            "{} is not a valid statistic. Use one of {}".format(
                self.statistic, valid_stats
            )
        )

    column_list = [
        self.subscriber_identifier,
        "location_id",
        "datetime",
        *self.direction.required_columns,
    ]
    self.unioned_query = location_joined_query(
        EventsTablesUnion(
            self.start,
            self.stop,
            tables=self.tables,
            columns=column_list,
            hours=hours,
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        ),
        spatial_unit=self.spatial_unit,
        time_col="datetime",
    )
    super().__init__()
def __init__(
    self,
    start,
    stop,
    numerator,
    *,
    numerator_direction: Union[str, Direction] = Direction.BOTH,
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.BOTH,
    hours="all",
    subscriber_subset=None,
    tables="all",
):
    """
    Build the numerator and denominator ``EventCount`` queries.

    Parameters
    ----------
    start, stop
        Period boundaries, stored as given and passed to both counts.
    numerator : str or list
        Table(s) counted in the numerator; a non-list value is wrapped in
        a one-element list.
    numerator_direction : str or Direction, default Direction.BOTH
        Direction used for the numerator count.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.BOTH
        Direction used for the denominator count.
    hours, subscriber_subset
        Forwarded unchanged to both counts.
    tables : str or list of str, default 'all'
        Table(s) counted in the denominator.
    """
    self.start = start
    self.stop = stop
    self.subscriber_identifier = subscriber_identifier
    self.direction = Direction(direction)
    self.numerator_direction = Direction(numerator_direction)
    self.hours = hours
    self.tables = tables
    if isinstance(numerator, list):
        self.numerator = numerator
    else:
        self.numerator = [numerator]

    # Everything the two counts have in common.
    common = dict(
        subscriber_identifier=self.subscriber_identifier,
        hours=self.hours,
        subscriber_subset=subscriber_subset,
    )
    self.numerator_query = EventCount(
        self.start,
        self.stop,
        direction=self.numerator_direction,
        tables=self.numerator,
        **common,
    )
    self.denominator_query = EventCount(
        self.start,
        self.stop,
        direction=self.direction,
        tables=self.tables,
        **common,
    )
    super().__init__()
def __init__(
    self,
    start: str,
    stop: str,
    statistic: str = "avg",
    *,
    hours: Union[str, Tuple[int, int]] = "all",
    tables: Union[str, List[str]] = "all",
    subscriber_identifier: str = "msisdn",
    subscriber_subset: Optional[Query] = None,
    direction: Union[str, Direction] = Direction.OUT,
):
    """
    Validate the arguments and build the events union to aggregate.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    statistic : str, default 'avg'
        Aggregate name; lower-cased and checked against ``valid_stats``.
    hours, tables, subscriber_subset
        Forwarded unchanged to ``EventsTablesUnion``.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.OUT
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.

    Raises
    ------
    ValueError
        If ``statistic`` (lower-cased) is not in ``valid_stats``.
    """
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.hours = hours
    self.tables = tables
    self.subscriber_identifier = subscriber_identifier
    self.direction = Direction(direction)

    wanted_columns = [self.subscriber_identifier, "datetime"]
    wanted_columns.extend(self.direction.required_columns)

    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        message = "{} is not a valid statistic. Use one of {}".format(
            self.statistic, valid_stats
        )
        raise ValueError(message)

    union_options = dict(
        tables=self.tables,
        columns=wanted_columns,
        hours=self.hours,
        subscriber_identifier=self.subscriber_identifier,
        subscriber_subset=subscriber_subset,
    )
    self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
    super().__init__()
def __init__(
    self,
    start,
    stop,
    contact_reciprocal,
    *,
    direction: Union[str, Direction] = Direction.OUT,
    subscriber_identifier="msisdn",
    hours: Optional[Tuple[int, int]] = None,
    subscriber_subset=None,
    tables="all",
    exclude_self_calls=True,
):
    """
    Record the parameters and build the events union.

    Parameters
    ----------
    start, stop
        Period boundaries, stored as given.
    contact_reciprocal
        Query stored as ``contact_reciprocal_query``.
    direction : str or Direction, default Direction.OUT
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    hours, subscriber_subset, tables
        Forwarded unchanged to ``EventsTablesUnion``.
    exclude_self_calls : bool, default True
        Stored on the instance for use when the query is generated.
    """
    self.start = start
    self.stop = stop
    self.subscriber_identifier = subscriber_identifier
    self.hours = hours
    self.exclude_self_calls = exclude_self_calls
    self.direction = Direction(direction)
    self.tables = tables

    # "msisdn" is always requested in addition to the chosen identifier.
    wanted_columns = [self.subscriber_identifier, "msisdn", "msisdn_counterpart"]
    wanted_columns.extend(self.direction.required_columns)

    union_options = dict(
        tables=self.tables,
        columns=wanted_columns,
        hours=hours,
        subscriber_identifier=subscriber_identifier,
        subscriber_subset=subscriber_subset,
    )
    self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
    self.contact_reciprocal_query = contact_reciprocal
    super().__init__()
def __init__(
    self,
    start: str,
    stop: str,
    *,
    table: Union[None, List[str]] = None,
    spatial_unit: AnySpatialUnit = make_spatial_unit("cell"),
    interval: str = "hour",
    direction: Union[str, Direction] = Direction.BOTH,
    hours: Optional[Tuple[int, int]] = None,
    subscriber_subset=None,
    subscriber_identifier="msisdn",
):
    """
    Validate the interval, derive the time columns and build the
    location-joined events query.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    table : list of str or None
        Passed to ``EventsTablesUnion`` as ``tables``.
    spatial_unit : default ``make_spatial_unit("cell")``
        Spatial unit handed to ``location_joined_query``.
    interval : str, default 'hour'
        Must be one of ``self.allowed_intervals``. 'hour' adds an hour
        column to the date column; 'min' adds hour and minute columns.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    hours, subscriber_subset, subscriber_identifier
        Forwarded unchanged to ``EventsTablesUnion``.

    Raises
    ------
    ValueError
        If ``interval`` is not in ``self.allowed_intervals``.
    """
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.table = table
    self.spatial_unit = spatial_unit
    self.interval = interval
    self.direction = Direction(direction)

    if self.interval not in self.allowed_intervals:
        message = "'Interval must be one of: {} got: {}".format(
            self.allowed_intervals, self.interval
        )
        raise ValueError(message)

    # The date is always selected; finer intervals add hour, then minute.
    self.time_cols = ["(datetime::date)::text AS date"]
    if self.interval in ("hour", "min"):
        self.time_cols.append("extract(hour FROM datetime) AS hour")
    if self.interval == "min":
        self.time_cols.append("extract(minute FROM datetime) AS min")

    # The direction column is only fetched when a direction filter needs
    # it: that is cheaper, and it leaves room for event types that do not
    # carry this information.
    wanted_columns = ["location_id", "datetime", subscriber_identifier]
    wanted_columns.extend(self.direction.required_columns)

    events_union = EventsTablesUnion(
        self.start,
        self.stop,
        tables=self.table,
        columns=wanted_columns,
        hours=hours,
        subscriber_subset=subscriber_subset,
        subscriber_identifier=subscriber_identifier,
    )
    self.unioned = location_joined_query(
        events_union,
        spatial_unit=self.spatial_unit,
        time_col="datetime",
    )
    super().__init__()
def __init__(
    self,
    start,
    stop,
    proportion=0.8,
    *,
    direction: Union[str, Direction] = Direction.BOTH,
    tables="all",
    subscriber_identifier="msisdn",
    hours="all",
    exclude_self_calls=False,
    subscriber_subset=None,
):
    """
    Validate ``proportion`` and build the two sub-queries this feature uses.

    Parameters
    ----------
    start, stop
        Period boundaries, stored as given and passed to both sub-queries.
    proportion : float, default 0.8
        Must lie strictly between 0 and 1.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``.
    tables, subscriber_identifier, hours, exclude_self_calls, subscriber_subset
        Passed identically to ``ContactBalance`` and ``SubscriberDegree``.

    Raises
    ------
    ValueError
        If ``proportion`` is not strictly between 0 and 1.
    """
    self.start = start
    self.stop = stop
    self.hours = hours
    self.direction = Direction(direction)
    self.tables = tables
    self.subscriber_identifier = subscriber_identifier
    self.exclude_self_calls = exclude_self_calls

    if not (1 > proportion > 0):
        raise ValueError("{} is not a valid proportion.".format(proportion))
    self.proportion = proportion

    # Both sub-queries are configured with exactly the same options.
    shared_options = dict(
        hours=self.hours,
        tables=self.tables,
        subscriber_identifier=self.subscriber_identifier,
        direction=self.direction,
        exclude_self_calls=self.exclude_self_calls,
        subscriber_subset=subscriber_subset,
    )
    self.contact_balance = ContactBalance(self.start, self.stop, **shared_options)
    self.subscriber_degree = SubscriberDegree(
        self.start, self.stop, **shared_options
    )

    self._cols = ["subscriber", "pareto"]
    super().__init__()
def __init__(
    self,
    start,
    stop,
    *,
    hours: Optional[Tuple[int, int]] = None,
    tables="all",
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.BOTH,
    exclude_self_calls=True,
    subscriber_subset=None,
):
    """
    Record the parameters and build the events union.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    hours, tables, subscriber_subset
        Forwarded unchanged to ``EventsTablesUnion``.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    exclude_self_calls : bool, default True
        Stored on the instance for use when the query is generated.
    """
    self.tables = tables
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.hours = hours
    self.direction = Direction(direction)
    self.subscriber_identifier = subscriber_identifier
    self.exclude_self_calls = exclude_self_calls

    column_list = [
        self.subscriber_identifier,
        "msisdn_counterpart",
        *self.direction.required_columns,
    ]
    self.unioned_query = EventsTablesUnion(
        self.start,
        self.stop,
        columns=column_list,
        tables=self.tables,
        subscriber_identifier=self.subscriber_identifier,
        hours=hours,
        subscriber_subset=subscriber_subset,
    )
    self._cols = ["subscriber", "msisdn_counterpart", "events", "proportion"]
    super().__init__()
def __init__(
    self,
    start,
    stop,
    *,
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.OUT,
    statistic="sum",
    hours="all",
    subscriber_subset=None,
):
    """
    Validate the arguments and build the calls union to aggregate.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.OUT
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    statistic : str, default 'sum'
        Aggregate name; lower-cased and checked against ``valid_stats``.
    hours, subscriber_subset
        Forwarded unchanged to ``EventsTablesUnion``.

    Raises
    ------
    ValueError
        If ``statistic`` (lower-cased) is not in ``valid_stats``.
    """
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.subscriber_identifier = subscriber_identifier
    self.hours = hours
    self.direction = Direction(direction)

    self.statistic = statistic.lower()
    if self.statistic not in valid_stats:
        message = "{} is not a valid statistic. Use one of {}".format(
            self.statistic, valid_stats
        )
        raise ValueError(message)

    wanted_columns = [self.subscriber_identifier, "duration"]
    wanted_columns.extend(self.direction.required_columns)

    # Only the calls table is queried here.
    union_options = dict(
        tables="events.calls",
        columns=wanted_columns,
        hours=hours,
        subscriber_subset=subscriber_subset,
        subscriber_identifier=subscriber_identifier,
    )
    self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
    super().__init__()
def __init__(
    self,
    start,
    stop,
    *,
    hours="all",
    tables="all",
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.BOTH,
    exclude_self_calls=True,
    subscriber_subset=None,
):
    """
    Record the parameters and build the events union.

    Parameters
    ----------
    start, stop
        Period boundaries, stored as given.
    hours, tables, subscriber_subset
        Forwarded unchanged to ``EventsTablesUnion``.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    exclude_self_calls : bool, default True
        Stored on the instance for use when the query is generated.
    """
    self.start = start
    self.stop = stop
    self.hours = hours
    self.direction = Direction(direction)
    self.subscriber_identifier = subscriber_identifier
    self.exclude_self_calls = exclude_self_calls
    self.tables = tables

    wanted_columns = [self.subscriber_identifier, "msisdn_counterpart"]
    wanted_columns.extend(self.direction.required_columns)

    union_options = dict(
        hours=self.hours,
        tables=self.tables,
        columns=wanted_columns,
        subscriber_identifier=self.subscriber_identifier,
        subscriber_subset=subscriber_subset,
    )
    self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
    self._cols = ["subscriber", "degree"]
    super().__init__()
class PerLocationSubscriberCallDurations(SubscriberFeature): """ This class returns the total amount of time a subscriber spent calling within the period, optionally limited to only calls they made, or received, faceted by their location at the time. Parameters ---------- start, stop : str iso-format start and stop datetimes hours : 2-tuple of floats, default 'all' Restrict the analysis to only a certain set of hours within each day. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. direction : {'in', 'out', 'both'} or Direction, default Direction.OUT Whether to consider calls made, received, or both. Defaults to 'out'. spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3 Spatial unit to which subscriber locations will be mapped. See the docstring of make_spatial_unit for more information. statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum' Defaults to sum, aggregation statistic over the durations. Examples -------- >>> s = PerLocationSubscriberCallDurations("2016-01-01", "2016-01-07", direction="in") >>> s.get_dataframe() subscriber name value 0 038OVABN11Ak4W5P Baglung 1979.0 1 038OVABN11Ak4W5P Banke 2204.0 2 038OVABN11Ak4W5P Dolpa 9169.0 ... 
""" def __init__( self, start, stop, *, subscriber_identifier="msisdn", direction: Union[str, Direction] = Direction.OUT, statistic="sum", spatial_unit: Optional[AnySpatialUnit] = None, hours="all", subscriber_subset=None, ): self.start = standardise_date(start) self.stop = standardise_date(stop) self.subscriber_identifier = subscriber_identifier self.direction = Direction(direction) if spatial_unit is None: self.spatial_unit = make_spatial_unit("admin", level=3) else: self.spatial_unit = spatial_unit self.statistic = statistic.lower() if self.statistic not in valid_stats: raise ValueError( "{} is not a valid statistic. Use one of {}".format( self.statistic, valid_stats ) ) column_list = [ self.subscriber_identifier, "msisdn_counterpart", "duration", "location_id", "datetime", *self.direction.required_columns, ] self.unioned_query = location_joined_query( EventsTablesUnion( self.start, self.stop, tables="events.calls", columns=column_list, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=self.subscriber_identifier, ), spatial_unit=self.spatial_unit, time_col="datetime", ) super().__init__() @property def column_names(self) -> List[str]: return ["subscriber"] + self.spatial_unit.location_id_columns + ["value"] def _make_query(self): loc_cols = ", ".join(self.spatial_unit.location_id_columns) where_clause = make_where(self.direction.get_filter_clause()) return f"""
class PerLocationEventStats(SubscriberFeature): """ This class returns the statistics of event count per location per subscriber within the period, optionally limited to only incoming or outgoing events. For instance, it calculates the average number of events per cell per subscriber. Parameters ---------- start, stop : str iso-format start and stop datetimes statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'avg' Defaults to avg, aggregation statistic over the durations. hours : 2-tuple of floats, default 'all' Restrict the analysis to only a certain set of hours within each day. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH Whether to consider calls made, received, or both. Defaults to 'both'. tables : str or list of strings, default 'all' Can be a string of a single table (with the schema) or a list of these. The keyword all is to select all subscriber tables spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell Spatial unit to which subscriber locations will be mapped. See the docstring of make_spatial_unit for more information. Examples -------- >>> s = PerLocationEventStats("2016-01-01", "2016-01-07") >>> s.get_dataframe() subscriber value OemQ7q2DLZMWnwzB 1.388889 By4j6PKdB4NGMpxr 1.421053 L4V537alj321eWz6 1.130435 4pQo67v0PWyLdYKO 1.400000 8br1gO32xWXxjY0R 1.100000 ... ... 
""" def __init__( self, start, stop, statistic="avg", *, spatial_unit: AnySpatialUnit = make_spatial_unit("cell"), hours="all", tables="all", subscriber_identifier="msisdn", direction: Union[str, Direction] = Direction.BOTH, subscriber_subset=None, ): self.start = standardise_date(start) self.stop = standardise_date(stop) self.spatial_unit = spatial_unit self.hours = hours self.tables = tables self.subscriber_identifier = subscriber_identifier self.direction = Direction(direction) self.statistic = statistic if self.statistic not in valid_stats: raise ValueError( "{} is not a valid statistic. Use one of {}".format( self.statistic, valid_stats)) column_list = [ self.subscriber_identifier, "location_id", "datetime", *self.direction.required_columns, ] self.unioned_query = location_joined_query( EventsTablesUnion( self.start, self.stop, tables=self.tables, columns=column_list, hours=hours, subscriber_identifier=subscriber_identifier, subscriber_subset=subscriber_subset, ), spatial_unit=self.spatial_unit, time_col="datetime", ) super().__init__() @property def column_names(self): return ["subscriber", "value"] def _make_query(self): loc_cols = ", ".join(self.spatial_unit.location_id_columns) where_clause = make_where(self.direction.get_filter_clause()) return f"""
class SubscriberCallDurations(SubscriberFeature): """ This class returns the total amount of time a subscriber spent calling within the period, optionally limited to only calls they made, or received. Parameters ---------- start, stop : str iso-format start and stop datetimes hours : 2-tuple of floats, default 'all' Restrict the analysis to only a certain set of hours within each day. subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn' Either msisdn, or imei, the column that identifies the subscriber. subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None If provided, string or list of string which are msisdn or imeis to limit results to; or, a query or table which has a column with a name matching subscriber_identifier (typically, msisdn), to limit results to. direction : {'in', 'out', 'both'} or Direction, default Direction.OUT Whether to consider calls made, received, or both. Defaults to 'out'. statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum' Defaults to sum, aggregation statistic over the durations. Examples -------- >>> s = SubscriberCallDurations("2016-01-01", "2016-01-07", direction="in") >>> s.get_dataframe() msisdn value 0 jWlyLwbGdvKV35Mm 4038.0 1 EreGoBpxJOBNl392 12210.0 2 nvKNoAmxMvBW4kJr 10847.0 3 VkzMxYjv7mYn53oK 48374.0 4 BKMy1nYEZpnoEA7G 8697.0 ... """ def __init__( self, start, stop, *, subscriber_identifier="msisdn", direction: Union[str, Direction] = Direction.OUT, statistic="sum", hours="all", subscriber_subset=None, ): self.start = standardise_date(start) self.stop = standardise_date(stop) self.subscriber_identifier = subscriber_identifier self.hours = hours self.direction = Direction(direction) self.statistic = statistic.lower() if self.statistic not in valid_stats: raise ValueError( "{} is not a valid statistic. 
Use one of {}".format( self.statistic, valid_stats ) ) column_list = [ self.subscriber_identifier, "duration", *self.direction.required_columns, ] self.unioned_query = EventsTablesUnion( self.start, self.stop, tables="events.calls", columns=column_list, hours=hours, subscriber_subset=subscriber_subset, subscriber_identifier=subscriber_identifier, ) super().__init__() @property def column_names(self) -> List[str]: return ["subscriber", "value"] def _make_query(self): where_clause = make_where(self.direction.get_filter_clause()) return f"""
class NocturnalEvents(SubscriberFeature):
    """
    Represents the percentage of events that a subscriber makes/receives which
    began at night. The definition of night is configurable.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : tuple of ints, default (20, 4)
        Hours that count as being nocturnal. e.g. (20,4) will be the times
        after 8pm and before 4 am. If the first hour is smaller than the
        second, e.g. (1, 5), the window does not cross midnight and covers
        hours from the first (inclusive) up to the second (exclusive).
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Whether to consider calls made, received, or both. Defaults to 'both'.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables

    Examples
    --------

    >>> s = NocturnalEvents("2016-01-01", "2016-01-02")
    >>> s.get_dataframe()

          subscriber                 value
    2ZdMowMXoyMByY07              0.000000
    MobnrVMDK24wPRzB             40.000000
    0Ze1l70j0LNgyY4w             16.666667
    Nnlqka1oevEMvVrm             33.333333
    4dqenN2oQZExwEK2             83.333333
                 ...                   ...
    """

    def __init__(
        self,
        start,
        stop,
        hours=(20, 4),
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
        tables="all",
    ):
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.subscriber_identifier = subscriber_identifier
        self.direction = Direction(direction)
        self.hours = hours
        self.tables = tables

        column_list = [
            self.subscriber_identifier,
            "datetime",
            *self.direction.required_columns,
        ]

        # All hours are fetched on purpose: the nocturnal share is computed
        # against every event in the period, not only the night-time ones.
        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            tables=self.tables,
            columns=column_list,
            hours="all",
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        super().__init__()

    @property
    def column_names(self):
        return ["subscriber", "value"]

    def _make_query(self):
        where_clause = make_where(self.direction.get_filter_clause())
        night_start, night_end = self.hours[0], self.hours[1]

        # A window that wraps midnight (e.g. 20 -> 4) matches hours on either
        # side of it, so the bounds are OR-ed. A same-day window (e.g. 1 -> 5)
        # needs both bounds to hold; OR-ing them would match every hour.
        combine = "AND" if float(night_start) < float(night_end) else "OR"

        sql = f"""
        SELECT subscriber, AVG(nocturnal)*100 AS value
        FROM (
            SELECT subscriber,
                CASE
                    WHEN extract(hour FROM datetime) >= {night_start}
                      {combine} extract(hour FROM datetime) < {night_end}
                    THEN 1
                ELSE 0
            END AS nocturnal
            FROM ({self.unioned_query.get_query()}) U
            {where_clause}
        ) U
        GROUP BY subscriber
        """

        return sql
def __init__(
    self,
    start,
    stop,
    phase="hour",
    *,
    subscriber_identifier="msisdn",
    direction: Union[str, Direction] = Direction.BOTH,
    hours: Optional[Tuple[int, int]] = None,
    subscriber_subset=None,
    tables="all",
):
    """
    Validate ``phase`` and build the events union.

    Parameters
    ----------
    start, stop : str
        Period boundaries; normalised with ``standardise_date``.
    phase : str, default 'hour'
        Must be one of the field names listed in ``allowed_phases`` below.
    subscriber_identifier : str, default 'msisdn'
        Column identifying the subscriber.
    direction : str or Direction, default Direction.BOTH
        Coerced to a ``Direction``; its ``required_columns`` are fetched too.
    hours, subscriber_subset, tables
        Forwarded unchanged to ``EventsTablesUnion``.

    Raises
    ------
    ValueError
        If ``phase`` is not one of the allowed phases.
    """
    self.tables = tables
    self.start = standardise_date(start)
    self.stop = standardise_date(stop)
    self.subscriber_identifier = subscriber_identifier
    self.direction = Direction(direction)
    self.hours = hours

    wanted_columns = [self.subscriber_identifier, "datetime"]
    wanted_columns.extend(self.direction.required_columns)

    # extracted from the POSTGRES manual
    allowed_phases = (
        "century",
        "day",
        "decade",
        "dow",
        "doy",
        "epoch",
        "hour",
        "isodow",
        "isoyear",
        "microseconds",
        "millennium",
        "milliseconds",
        "minute",
        "month",
        "quarter",
        "second",
        "week",
        "year",
    )
    phase_is_known = phase in allowed_phases
    if not phase_is_known:
        raise ValueError(
            f"{phase} is not a valid phase. Choose one of {allowed_phases}"
        )
    self.phase = phase

    union_options = dict(
        tables=self.tables,
        columns=wanted_columns,
        hours=hours,
        subscriber_identifier=subscriber_identifier,
        subscriber_subset=subscriber_subset,
    )
    self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
    super().__init__()
class DistanceCounterparts(SubscriberFeature):
    """
    This class returns metrics related with the distance between event
    initiator and her/his counterparts.

    It assumes that the ID column uniquely identifies the event initiator and
    their counterparts' event. Choose only tables for which this assumption is
    true. In some cases, asynchronous communication like SMS might not be
    tagged with an ID that allows one to recover the counterpart event.

    Distances are measured in km.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'avg'
        Aggregation statistic over the distances. Case-insensitive.
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    tables : str, default 'all'
        The table must have a `msisdn_counterpart` column.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Direction filter applied to the subscriber's side of each matched
        pair of events.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself

    Raises
    ------
    ValueError
        If ``statistic`` is not one of the valid statistics.

    Examples
    --------

    >>> s = DistanceCounterparts("2016-01-01", "2016-01-07", statistic="avg")
    >>> s.get_dataframe()

          subscriber         value
    038OVABN11Ak4W5P    272.167815
    09NrjaNNvDanD8pk    241.290233
    0ayZGYEQrqYlKw6g    218.161568
    0DB8zw67E9mZAPK2    228.235324
    0Gl95NRLjW2aw8pW    189.008980
                 ...           ...
    """

    def __init__(
        self,
        start,
        stop,
        statistic="avg",
        *,
        hours="all",
        tables="all",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
        exclude_self_calls=True,
    ):
        self.tables = tables
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.exclude_self_calls = exclude_self_calls
        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                "{} is not a valid statistic. Use one of {}".format(
                    self.statistic, valid_stats
                )
            )

        column_list = ["msisdn", "msisdn_counterpart", "id", "location_id", "outgoing"]

        # EventsTablesUnion will only subset on the subscriber identifier,
        # which means that we need to query for a unioned table twice. That has
        # a considerable negative impact on execution time.
        self.unioned_from_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier="msisdn",
            hours=hours,
            subscriber_subset=subscriber_subset,
        )

        self.unioned_to_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier="msisdn_counterpart",
            hours=hours,
            subscriber_subset=subscriber_subset,
        )

        self.distance_matrix = DistanceMatrix()

        super().__init__()

    @property
    def column_names(self) -> List[str]:
        return ["subscriber", "value"]

    def _make_query(self):
        # Filters apply to the subscriber's side (alias A) of each event pair.
        filters = [self.direction.get_filter_clause("A")]
        if self.exclude_self_calls:
            filters.append("A.subscriber != A.msisdn_counterpart")
        where_clause = make_where(filters)

        # A and B are the two sides of the same event: same id, opposite
        # value of the outgoing flag.
        sql = f"""
        SELECT
            U.subscriber AS subscriber,
            {self.statistic}(D.value) AS value
        FROM
            (
                SELECT A.subscriber, A.location_id AS location_id_from, B.location_id AS location_id_to FROM
                ({self.unioned_from_query.get_query()}) AS A
                JOIN ({self.unioned_to_query.get_query()}) AS B
                ON A.id = B.id AND A.outgoing != B.outgoing {where_clause}
            ) U
        JOIN
            ({self.distance_matrix.get_query()}) D
        USING (location_id_from, location_id_to)
        GROUP BY U.subscriber
        """

        return sql
class IntereventInterval(SubscriberFeature):
    """
    Per-subscriber statistic (e.g. average or standard deviation) of the time
    elapsed between consecutive events, returned as a time interval.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'avg'
        Aggregation statistic over the gaps between events. Case-insensitive.
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.OUT
        Whether to consider calls made, received, or both. Defaults to 'out'.

    Raises
    ------
    ValueError
        If ``statistic`` is not one of the valid statistics.

    Examples
    --------

    >>> s = IntereventInterval("2016-01-01", "2016-01-07")
    >>> s.get_dataframe()

               subscriber            value
    0    038OVABN11Ak4W5P  04:57:22.428571
    1    09NrjaNNvDanD8pk  03:52:38.454545
    2    0ayZGYEQrqYlKw6g  04:02:05.666667
    3    0DB8zw67E9mZAPK2  06:32:30.714285
    4    0Gl95NRLjW2aw8pW  05:44:20.625000
    ..                ...              ...
    495  ZQG8glazmxYa1K62  04:12:27.705882
    496  Zv4W9eak2QN1M5A7  03:41:10.323529
    497  zvaOknzKbEVD2eME  04:21:27.218750
    498  Zy3DkbY7MDd6Er7l  04:33:00.870968
    499  ZYPxqVGLzlQy6l7n  04:01:28.212121

    [500 rows x 2 columns]
    """

    def __init__(
        self,
        start: str,
        stop: str,
        statistic: str = "avg",
        *,
        hours: Union[str, Tuple[int, int]] = "all",
        tables: Union[str, List[str]] = "all",
        subscriber_identifier: str = "msisdn",
        subscriber_subset: Optional[Query] = None,
        direction: Union[str, Direction] = Direction.OUT,
    ):
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.direction = Direction(direction)

        wanted_columns = [self.subscriber_identifier, "datetime"]
        wanted_columns.extend(self.direction.required_columns)

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            message = "{} is not a valid statistic. Use one of {}".format(
                self.statistic, valid_stats
            )
            raise ValueError(message)

        union_options = dict(
            tables=self.tables,
            columns=wanted_columns,
            hours=self.hours,
            subscriber_identifier=self.subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        self.unioned_query = EventsTablesUnion(self.start, self.stop, **union_options)
        super().__init__()

    @property
    def column_names(self):
        return ["subscriber", "value"]

    def _make_query(self):
        where_clause = make_where(self.direction.get_filter_clause())

        # Postgres does not support the following three operations with
        # intervals, so they are computed on epoch seconds and converted back.
        computed_on_seconds = self.statistic in {"median", "stddev", "variance"}
        statistic_clause = (
            f"MAKE_INTERVAL(secs => {self.statistic}(EXTRACT(EPOCH FROM delta)))"
            if computed_on_seconds
            else f"{self.statistic}(delta)"
        )

        # delta is the gap to the subscriber's previous event; LAG yields NULL
        # for the first event of each subscriber.
        sql = f"""
        SELECT
            subscriber,
            {statistic_clause} AS value
        FROM (
            SELECT subscriber, datetime - LAG(datetime, 1, NULL) OVER (PARTITION BY subscriber ORDER BY datetime) AS delta
            FROM ({self.unioned_query.get_query()}) AS U
            {where_clause}
        ) AS U
        GROUP BY subscriber
        """

        return sql
class ProportionEventReciprocal(SubscriberFeature):
    """
    This class calculates the proportion of events with a reciprocal contact
    per subscriber.  It is possible to fine-tune the period for which a
    reciprocal contact must have happened.

    A reciprocal contact is a contact who has initiated contact with the
    subscriber and who also has been the counterpart of a contact initiated
    by the subscriber.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    contact_reciprocal : flowmachine.features.ContactReciprocal
        An instance of `ContactReciprocal` listing which contacts are
        reciprocal and which are not. Its query must expose the columns
        `subscriber`, `msisdn_counterpart` and `reciprocal`.
    direction : {'in', 'out', 'both'} or Direction, default Direction.OUT
        Whether to consider calls made, received, or both. Defaults to 'out'.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    hours : 2-tuple of ints, default None
        Restrict the analysis to only a certain set
        of hours within each day. Passed through to `EventsTablesUnion`.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself

    Examples
    --------

    >>> s = ProportionEventReciprocal('2016-01-01', '2016-01-08',
    ...     ContactReciprocal('2016-01-01', '2016-01-08'))
    >>> s.get_dataframe()

          subscriber  value
    9vXy462Ej8V1kpWl    0.0
    Q4mwVxpBOo7X2lb9    0.0
    5jLW0EWeoyg6NQo3    0.0
    QEoRM9vlkV18N4ZY    0.0
    a76Ajyb9dmEYNd8L    0.0
                 ...    ...
    """

    def __init__(
        self,
        start,
        stop,
        contact_reciprocal,
        *,
        direction: Union[str, Direction] = Direction.OUT,
        subscriber_identifier="msisdn",
        hours: Optional[Tuple[int, int]] = None,
        subscriber_subset=None,
        tables="all",
        exclude_self_calls=True,
    ):
        # Normalise the dates the same way the sibling features in this file
        # do, so equivalent spellings of one date are stored identically.
        # NOTE(review): assumes standardise_date passes None through - confirm.
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.exclude_self_calls = exclude_self_calls
        self.direction = Direction(direction)
        self.tables = tables

        # "msisdn" is requested explicitly because the join against the
        # reciprocal-contact query uses it when the subscriber is identified
        # by something other than msisdn (see _make_query).
        column_list = [
            self.subscriber_identifier,
            "msisdn",
            "msisdn_counterpart",
            *self.direction.required_columns,
        ]

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            tables=self.tables,
            columns=column_list,
            hours=hours,
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )

        self.contact_reciprocal_query = contact_reciprocal

        super().__init__()

    @property
    def column_names(self):
        """Names of the columns produced by this query."""
        return ["subscriber", "value"]

    def _make_query(self):
        """Build the SQL averaging the per-event reciprocal flag by subscriber."""
        filters = [self.direction.get_filter_clause()]
        if self.exclude_self_calls:
            filters.append("subscriber != msisdn_counterpart")
        where_clause = make_where(filters)

        # The reciprocal query's subscriber column is matched against the
        # event's msisdn; that is the `subscriber` column only when msisdn is
        # the subscriber identifier.
        on_clause = f"""
        ON {'U.subscriber' if self.subscriber_identifier == 'msisdn' else 'U.msisdn'} = R.subscriber
            AND U.msisdn_counterpart = R.msisdn_counterpart
        """

        # Events with no matching row in the reciprocal query count as
        # non-reciprocal (COALESCE to FALSE) rather than being dropped.
        sql = f"""
        SELECT subscriber, AVG(reciprocal::int) AS value
        FROM (
            SELECT U.subscriber, COALESCE(reciprocal, FALSE) AS reciprocal
            FROM (
                SELECT *
                FROM ({self.unioned_query.get_query()}) U
                {where_clause}
            ) U
            LEFT JOIN (
                SELECT subscriber, msisdn_counterpart, reciprocal
                FROM ({self.contact_reciprocal_query.get_query()}) R
            ) R
            {on_clause}
        ) R
        GROUP BY subscriber
        """

        return sql
class SubscriberDegree(SubscriberFeature):
    """
    Find the total number of unique contacts
    that each subscriber interacts with.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    tables : str, default 'all'
        Events tables to draw from; passed through to `EventsTablesUnion`.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Whether to consider calls made, received, or both. Defaults to 'both'.
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.

    Notes
    -----

    `subscriber_identifier` refers only to the subject of the analysis
    so for example subscriber_identifier='imei' will find all the unique
    msisdns that each imei calls. There is currently no way to specify
    the unique number of imei that each subscriber calls for instance.

    Examples
    --------

    >>> SubscriberDegree('2016-01-01', '2016-01-01')
                   subscriber  value
        0    038OVABN11Ak4W5P      2
        1    09NrjaNNvDanD8pk      2
        2    0ayZGYEQrqYlKw6g      2
        3    0DB8zw67E9mZAPK2      2
        4    0Gl95NRLjW2aw8pW      2
        5    0gmvwzMAYbz5We1E      2
        ...

    """

    def __init__(
        self,
        start,
        stop,
        *,
        hours="all",
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        exclude_self_calls=True,
        subscriber_subset=None,
    ):
        # Normalise the dates the same way the sibling features in this file
        # do, so equivalent spellings of one date are stored identically.
        # NOTE(review): assumes standardise_date passes None through - confirm.
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls
        self.tables = tables

        column_list = [
            self.subscriber_identifier,
            "msisdn_counterpart",
            *self.direction.required_columns,
        ]

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            hours=self.hours,
            tables=self.tables,
            columns=column_list,
            subscriber_identifier=self.subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        # Kept in step with `column_names` and the SQL alias; this previously
        # named a "degree" column that the query never produces.
        self._cols = ["subscriber", "value"]
        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """Names of the columns produced by this query."""
        return ["subscriber", "value"]

    def _make_query(self):
        """Build the SQL counting distinct counterparts per subscriber."""
        filters = [self.direction.get_filter_clause()]
        if self.exclude_self_calls:
            filters.append("subscriber != msisdn_counterpart")
        where_clause = make_where(filters)

        # De-duplicate (subscriber, counterpart) pairs first so that COUNT(*)
        # counts contacts rather than events.
        sql = f"""
        SELECT
           subscriber,
           COUNT(*) AS value
        FROM (
            SELECT DISTINCT subscriber, msisdn_counterpart
            FROM ({self.unioned_query.get_query()}) AS U
            {where_clause}
        ) AS U
        GROUP BY subscriber
        """

        return sql
class ContactBalance(GraphMixin, SubscriberFeature):
    """
    Count the events each subscriber shares with each counterpart, together
    with the share of the subscriber's total events that counterpart accounts
    for. The result can be used as a subscriber's contact network graph, with
    the proportion as the weight of each edge.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : 2-tuple of ints, default None
        Restrict the analysis to only a certain set
        of hours within each day. Passed through to `EventsTablesUnion`.
    tables : str, default 'all'
        Events tables to draw from; passed through to `EventsTablesUnion`.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Event direction to include in computation. This
        can be outgoing ('out'), incoming ('in'), or both ('both').
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself.
        Only applied when `subscriber_identifier` is 'msisdn'.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.

    Examples
    --------

    >>> ContactBalance('2016-01-01', '2016-01-07')

                 subscriber  msisdn_counterpart  events  proportion
        0  038OVABN11Ak4W5P    09NrjaNNvDanD8pk     110        0.54
        1  09NrjaNNvDanD8pk    0ayZGYEQrqYlKw6g      94        0.44
        2  0ayZGYEQrqYlKw6g    0DB8zw67E9mZAPK2      70        0.23
        3  0DB8zw67E9mZAPK2    0DB8zw67E9mZAXFF      20        0.12
        ...
    """

    def __init__(
        self,
        start,
        stop,
        *,
        hours: Optional[Tuple[int, int]] = None,
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        exclude_self_calls=True,
        subscriber_subset=None,
    ):
        self.tables = tables
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls

        # Pull only the subscriber, the counterpart and whatever the
        # direction filter needs from the events tables.
        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=[
                self.subscriber_identifier,
                "msisdn_counterpart",
                *self.direction.required_columns,
            ],
            tables=self.tables,
            subscriber_identifier=self.subscriber_identifier,
            hours=hours,
            subscriber_subset=subscriber_subset,
        )
        self._cols = ["subscriber", "msisdn_counterpart", "events", "proportion"]
        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """Names of the columns produced by this query."""
        return ["subscriber", "msisdn_counterpart", "events", "proportion"]

    def _make_query(self):
        """Build the SQL giving per-counterpart event counts and proportions."""
        conditions = [self.direction.get_filter_clause()]
        # The self-call filter is only applied when the subscriber column
        # holds an msisdn.
        if self.exclude_self_calls and self.subscriber_identifier == "msisdn":
            conditions.append("subscriber != msisdn_counterpart")
        event_filter = make_where(conditions)

        return f"""
        WITH unioned AS (
            SELECT
                *
            FROM ({self.unioned_query.get_query()}) as U
            {event_filter}
        ),
        total_events AS (
            SELECT subscriber, count(*) AS events
            FROM unioned
            GROUP BY subscriber
        )
        SELECT
            U.subscriber,
            U.msisdn_counterpart,
            count(*) as events,
            (count(*)::float / T.events::float) as proportion
        FROM
        (SELECT U.subscriber, U.msisdn_counterpart
         FROM unioned as U) AS U
        JOIN total_events AS T
            ON U.subscriber = T.subscriber
        GROUP BY U.subscriber,
                 U.msisdn_counterpart,
                 T.events
        ORDER BY proportion DESC
        """

    def counterparts_subset(self, include_subscribers=False):
        """
        Return the subset of counterparts as a `_ContactBalanceSubset` built
        from this query.

        In some cases, we are interested in obtaining information about the
        subset of subscribers' contacts. Setting `include_subscribers` to
        `True` asks for the subscribers to be included alongside their
        counterparts.

        Parameters
        ----------
        include_subscribers : bool, default False
            Whether to include the list of subscribers in the subset as well.
        """
        subset = _ContactBalanceSubset(
            contact_balance=self, include_subscribers=include_subscribers
        )
        return subset