def _process_pipeline(self, read_stream):
    stream = read_stream.withColumn("viewer_id", col("header").getItem("viewerID"))

    distinct_count_viewer_id = stream \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name,
                                 aggregation_window=self._get_interval_duration("uniqCountWindow")))

    distinct_count_per_model_stream = stream \
        .withColumn("model_name", col("header").getItem("modelName")) \
        .aggregate(DistinctCount(group_fields=["model_name"],
                                 aggregation_field="viewer_id",
                                 aggregation_name=self._component_name))

    distinct_count_per_hardware_version_stream = stream \
        .withColumn("hardware_version", col("header").getItem("hardwareVersion")) \
        .aggregate(DistinctCount(group_fields=["hardware_version"],
                                 aggregation_field="viewer_id",
                                 aggregation_name=self._component_name))

    distinct_count_per_software_version_stream = stream \
        .withColumn("software_versions", explode(col("header").getItem("softwareVersions"))) \
        .withColumn("software_version", col("software_versions").getItem("version")) \
        .aggregate(DistinctCount(group_fields=["software_version"],
                                 aggregation_field="viewer_id",
                                 aggregation_name=self._component_name))

    return [distinct_count_per_model_stream,
            distinct_count_per_hardware_version_stream,
            distinct_count_per_software_version_stream,
            distinct_count_viewer_id]
def distinct_total_cpe_with_audio_dolby_digital_not_accepted_app_user_agreement(
        self, common_settings_pipeline):
    return common_settings_pipeline \
        .filter(col("`customer.appsOptIn`") == 'false') \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".cpe_with_not_accepted_app_user_agreement"))
def _process_pipeline(self, stream):
    # "UsageCollectorReport.*" is flattened by the select, so the nested
    # retries field is addressed as the top-level "retries" column afterwards.
    usage_stream = stream \
        .select("@timestamp", "UsageCollectorReport.*",
                col("header.viewerID").alias("viewer_id")) \
        .filter(col("retries") >= 1) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".with_retries"))

    return usage_stream
def __distinct_event_type_by_channel(self, read_stream):
    return read_stream \
        .where("event_type = 'TUNE_IN'") \
        .aggregate(DistinctCount(group_fields=["channel"],
                                 aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".tune_in",
                                 aggregation_window=self._get_interval_duration("uniqCountWindow")))
def __viewer_id_distinct_count_per_loading_screens(self, common_screen_load_pipeline):
    return common_screen_load_pipeline \
        .where("loading_screen is not NULL") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["loading_screen"],
                                 aggregation_name=self._component_name,
                                 aggregation_window=self._get_interval_duration("uniqCountWindow")))
def distinct_cpe_with_age_restriction_enabled(self, common_settings_pipeline):
    return common_settings_pipeline \
        .where("`profile.ageLock` is not NULL") \
        .withColumn("profile_age_lock", col("`profile.ageLock`")) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["profile_age_lock"],
                                 aggregation_name=self._component_name + ".cpe_with_age_restriction"))
def __restarted_stbs_total_count(self, common_vm_stat_pipeline):
    return common_vm_stat_pipeline \
        .select("@timestamp", "uptime", "viewer_id") \
        .where((col("uptime") >= 0) & (col("uptime") <= 100)) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".restarted_stbs",
                                 aggregation_window=self._get_interval_duration("uniqCountWindow")))
def __dag_details(self, read_stream):
    """
    :param read_stream: input stream with events from dag kafka topic
    :return: list of aggregated metrics
    """
    details = ".details"

    number_of_unique_tasks_in_the_dags = read_stream \
        .filter("dag is not NULL") \
        .filter("task is not NULL") \
        .aggregate(DistinctCount(group_fields=["dag"],
                                 aggregation_field="task",
                                 aggregation_name=self._component_name + details))

    dag_host_task_count = read_stream \
        .filter("dag is not NULL") \
        .filter("hostname is not NULL") \
        .filter("task is not NULL") \
        .aggregate(Count(group_fields=["dag", "hostname", "task"],
                         aggregation_name=self._component_name + details))

    bbc_dag_subtask_message_itv_generated_with_task_count = read_stream \
        .filter("dag is not NULL") \
        .filter("task is not NULL") \
        .where("dag like '%bbc%' and subtask_message like '%ITV generated%'") \
        .aggregate(Count(group_fields=["dag", "task"],
                         aggregation_name=self._component_name + ".highres.itv_gen"))

    return [
        number_of_unique_tasks_in_the_dags,
        dag_host_task_count,
        bbc_dag_subtask_message_itv_generated_with_task_count
    ]
def distinct_cpe_factory_reset_report(self, common_settings_pipeline):
    return common_settings_pipeline \
        .select("@timestamp",
                col("`cpe.factoryResetState`").alias("cpe_factory_reset_state"),
                "viewer_id") \
        .where("cpe_factory_reset_state is not NULL") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["cpe_factory_reset_state"],
                                 aggregation_name=self._component_name))
def __restarted_stbs_count_per_firmware(self, common_vm_stat_pipeline):
    return common_vm_stat_pipeline \
        .select("@timestamp", "uptime", "viewer_id",
                explode("software_versions.version").alias("software_version")) \
        .where((col("uptime") >= 0) & (col("uptime") <= 100)) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["software_version"],
                                 aggregation_name=self._component_name + ".restarted_stbs_per_firmware",
                                 aggregation_window=self._get_interval_duration("uniqCountWindow")))
def distinct_cpe_with_selected_subtitles_track_language(self, common_settings_pipeline):
    return common_settings_pipeline \
        .select("@timestamp",
                col("`profile.subLang`").alias("profile_sub_lang"),
                "viewer_id") \
        .where("profile_sub_lang is not NULL") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["profile_sub_lang"],
                                 aggregation_name=self._component_name + ".cpe_with_selected_subtitles_language"))
def distinct_cpe_count_recently_used_settings_items(self, common_settings_pipeline):
    return common_settings_pipeline \
        .select("@timestamp", "viewer_id",
                explode("`profile.recentlyUsedSettingsItems`").alias("settings_items")) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["settings_items"],
                                 aggregation_name=self._component_name + ".recently_used"))
def _process_pipeline(self, read_stream):
    stb_ids = read_stream.withColumn("stb_id", col("header.viewerID"))

    requests_count = stb_ids.aggregate(
        Count(aggregation_name=self._component_name + ".request"))

    stb_ids_distinct_count = stb_ids.aggregate(
        DistinctCount(aggregation_field="stb_id",
                      aggregation_name=self._component_name))

    return [requests_count, stb_ids_distinct_count]
def distinct_cpe_count_with_upgrade_status(self, common_settings_pipeline):
    return common_settings_pipeline \
        .select("@timestamp", "viewer_id",
                col("`cpe.country`").alias("cpe_country"),
                col("`cpe.upgradeStatus`").alias("upgrade_status")) \
        .where("cpe_country is not NULL") \
        .where("upgrade_status is not NULL") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["cpe_country", "upgrade_status"],
                                 aggregation_name=self._component_name + ".cpe_count_with_upgrade_status"))
def distinct_audio_brands_paired_with_each_cpe(self, common_settings_pipeline):
    return common_settings_pipeline \
        .select("@timestamp",
                col("`cpe.quicksetPairedDevicesInfo`").getItem("amp").getItem("brand").alias("brand"),
                col("`cpe.quicksetPairedDevicesInfo`").getItem("amp").getItem("isPaired").alias("is_paired"),
                "viewer_id") \
        .where("brand is not NULL") \
        .withColumn("brand", when(col("brand") == "", "unknown_brands").otherwise(col("brand"))) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["is_paired", "brand"],
                                 aggregation_name=self._component_name + ".audio_brands_paired_with_each_cpe"))
def _agg_unique_count(self, stream, type):
    """
    Count distinct "status" values per app and per the given `type` column
    (the `type` column is filled with the literal "all" before aggregation).
    :param stream: input stream to aggregate
    :param type: name of the grouping column to add
    :return: aggregated stream
    """
    aggregation = DistinctCount(group_fields=["app", type],
                                aggregation_field="status",
                                aggregation_name=self._component_name)
    return stream.withColumn(type, lit("all")).aggregate(aggregation)
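# A minimal usage sketch of the helper above (hypothetical caller and column
# name, not part of the original code): it is assumed to be invoked with the
# name of a grouping column to synthesise, e.g.
#
#   unique_statuses = self._agg_unique_count(read_stream, "api")
#
# which adds an "api" column filled with the literal "all" and then counts
# distinct "status" values per ("app", "api") pair.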
def __process_common_events(self, read_stream):
    """
    Aggregation for events to calculate common metrics
    :param read_stream: input stream with events from dag kafka topic
    :return: list of aggregated metrics
    """
    dag_count = read_stream \
        .select(col("hostname"), col("@timestamp"), col("dag")) \
        .aggregate(DistinctCount(group_fields=["hostname"],
                                 aggregation_field="dag",
                                 aggregation_name=self._component_name))

    success_and_failures_counts = read_stream \
        .select(col("@timestamp"), col("task"), col("dag"), col("message")) \
        .where(col("message").like("Task exited with return code%")) \
        .withColumn("status",
                    when(col("message").like("Task exited with return code 0%"), lit("success"))
                    .otherwise(lit("failure"))) \
        .aggregate(Count(group_fields=["dag", "task", "status"],
                         aggregation_name=self._component_name))

    return [dag_count, success_and_failures_counts]
def __distinct_active_hosts(self, read_stream):
    return read_stream \
        .aggregate(DistinctCount(aggregation_field="hostname",
                                 aggregation_name=self._component_name))
def __distinct_stb_high_memory_usage(self, read_stream):
    return read_stream \
        .where("usedKb > 1677721") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".high"))

def __distinct_stb_med_memory_usage(self, read_stream):
    return read_stream \
        .where("usedKb >= 1468006 and usedKb <= 1677721") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".med"))

def __distinct_stb_low_memory_usage(self, read_stream):
    return read_stream \
        .where("usedKb < 1468006") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".low"))
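# Note on the bucket thresholds above (an interpretation, assuming usedKb is
# reported in KiB):
#   1468006 KiB ~= 1.4 GiB  (1.4 * 1024 * 1024 = 1468006.4)
#   1677721 KiB ~= 1.6 GiB  (1.6 * 1024 * 1024 = 1677721.6)
# so "low" covers STBs below ~1.4 GiB of used memory, "med" roughly
# 1.4-1.6 GiB, and "high" above ~1.6 GiB.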
def __search_failures(self, error_report_stream):
    return error_report_stream \
        .where((col("code") == 8400) & (col("ctxt.search").isNotNull())) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".search_failures",
                                 aggregation_window=self._uniq_count_window))

def __rev_buffer_playouts(self, error_report_stream):
    return error_report_stream \
        .where((col("code") >= 2100) & (col("code") <= 2199)) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".rev_buffer_playouts",
                                 aggregation_window=self._uniq_count_window))

def __ltv_playout_errors(self, error_report_stream):
    return error_report_stream \
        .where((col("code") == 2004) | (col("code") == 2002)) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".ltv_playout_errors",
                                 aggregation_window=self._uniq_count_window))
def __host_names_unique_count(self, events):
    return events \
        .aggregate(DistinctCount(aggregation_field="hostname",
                                 aggregation_name=self._component_name))
def total_cpe_net_config_for_wifi_ethernet_channels(self, common_net_configuration_pipeline):
    return common_net_configuration_pipeline \
        .where("type is not NULL") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["type"],
                                 aggregation_name=self._component_name))

def distinct_total_net_config_enabled(self, common_net_configuration_pipeline):
    return common_net_configuration_pipeline \
        .where("enabled is not NULL") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 group_fields=["enabled"],
                                 aggregation_name=self._component_name))
def __count_distinct_active_stb_wifi(self, common_wifi_pipeline):
    return common_wifi_pipeline \
        .where("rxKbps > 0") \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".active"))

def __distinct_total_wifi_network_types_count(self, common_wifi_pipeline):
    return common_wifi_pipeline \
        .where((col("rxKbps") > 0) | (col("txKbps") > 0)) \
        .aggregate(DistinctCount(aggregation_field="viewer_id",
                                 aggregation_name=self._component_name + ".network"))
def __hostname_unique_count(self, read_stream):
    return read_stream \
        .aggregate(DistinctCount(aggregation_field="hostname",
                                 aggregation_name=self._component_name))