def _process_pipeline(self, read_stream):
    """Build the pipeline output: average ingest-script duration per hostname."""
    avg_ingest_duration = Avg(
        group_fields=["hostname"],
        aggregation_field="duration",
        aggregation_name=self._component_name + ".ingest")
    ingest_events = read_stream.where(
        "started_script == '/apps/ThinkAnalytics/ContentIngest/bin/ingest.sh'")
    return [ingest_events.aggregate(avg_ingest_duration)]
def __avg_frequency_stb_by_report_index(self, read_stream):
    """Average reported frequency per report index, restricted to locked tuners."""
    locked_with_frequency = read_stream \
        .where("locked = true") \
        .where("frequency is not NULL")
    frequency_avg = Avg(
        group_fields=["index"],
        aggregation_field="frequency",
        aggregation_name=self._component_name + ".locked")
    return locked_with_frequency.aggregate(frequency_avg)
def _process_pipeline(self, read_stream):
    """Average per-process resource metrics (rss/fds/threads/vsz) grouped by process name."""
    processes = read_stream \
        .withColumn("process", explode(col("TopProcesses").getItem("processes"))) \
        .selectExpr("process.*", "`@timestamp`")
    streams = []
    for metric in ["rss", "fds", "threads", "vsz"]:
        streams.append(processes.aggregate(
            Avg(group_fields="name",
                aggregation_field=metric,
                aggregation_name=self._component_name)))
    return streams
def _agg_end2end(self, stream):
    """
    Aggregate end to end calls from STB - uservice.

    Max/Min/Avg of duration_ms, grouped by tenant, app and status.
    :param stream: input stream
    :return: aggregated stream
    """
    duration = {"aggregation_field": "duration_ms"}
    compound = CompoundAggregation(
        aggregations=[Max(**duration), Min(**duration), Avg(**duration)],
        aggregation_name=self._component_name,
        group_fields=["tenant", "app", "status"])
    return stream.aggregate(compound)
def _agg_uservice2component_duration(self, stream):
    """
    Aggregate uservice - he component call duration.

    Max/Min/Avg of duration_ms, grouped by tenant, app, dest and host.
    :param stream: input stream
    :return: aggregated stream
    """
    duration = {"aggregation_field": "duration_ms"}
    compound = CompoundAggregation(
        aggregations=[Max(**duration), Min(**duration), Avg(**duration)],
        aggregation_name=self._component_name,
        group_fields=["tenant", "app", "dest", "host"])
    return stream.aggregate(compound)
def _process_pipeline(self, read_stream):
    """
    Build API statistics streams: average call duration, status counts,
    and per-app / per-STB request counts, all keyed by country.
    """
    # Country is derived from the trailing segment of the stack name;
    # the same Column expression is reused for both sub-pipelines.
    country_column = when(col("stack").isNotNull(),
                          regexp_extract("stack", r".*-(\w+)$", 1)) \
        .otherwise("undefined")

    # Drop zero-duration events and internal health/metrics endpoints.
    api_calls = read_stream \
        .where((col("duration_ms").cast("long") != 0) &
               ~ (col("requested_url").startswith("GET /info") |
                  col("requested_url").startswith("GET /prometheus"))) \
        .withColumn("country", country_column)

    average_duration = api_calls.aggregate(
        Avg(group_fields=["country", "host", "app", "app_version", "api_method"],
            aggregation_field="duration_ms",
            aggregation_name=self._component_name))
    count_by_status = api_calls.aggregate(
        Count(group_fields=["country", "host", "app", "app_version", "api_method", "status"],
              aggregation_name=self._component_name))

    # Requests that carry the STB device header.
    stb_requests = read_stream \
        .where(col("header_x-dev").isNotNull()) \
        .withColumn("country", country_column)

    count_by_app = stb_requests.aggregate(
        Count(group_fields=["country", "app"],
              aggregation_name=self._component_name + ".requests"))
    count_by_app_with_status = stb_requests \
        .where(col("status").isNotNull()) \
        .withColumn("status", custom_translate_regex(
            source_field=col("status"),
            mapping={r"^2\d\d": "successful"},
            default_value="failure")) \
        .aggregate(Count(group_fields=["country", "app", "status"],
                         aggregation_name=self._component_name + ".requests"))
    count_stb_requests = stb_requests.aggregate(
        Count(group_fields=["country", "header_x-dev"],
              aggregation_name=self._component_name + ".requests"))

    return [average_duration, count_by_status, count_stb_requests,
            count_by_app, count_by_app_with_status]
def aggregate(aggregation_field, group):
    """
    Build an averaged stream for one metric of one resource group.

    NOTE(review): nested helper — relies on `read_stream` and `self` from the
    enclosing scope.

    :param aggregation_field: name of the mem/net metric (a field under
        `metrics.*`) which needs to be averaged.
    :param group: resource group the metric belongs to; only rows whose
        "group" column equals this value are kept.
    :return: the aggregated stream (averages grouped by res_kind/group/name).
    """
    aggregation = Avg(group_fields=["res_kind", "group", "name"],
                      aggregation_field=aggregation_field,
                      aggregation_name=self._component_name)
    agg_stream = read_stream \
        .select("@timestamp", "group", "res_kind", "name",
                col("metrics.{}".format(aggregation_field)).alias(aggregation_field)) \
        .filter(
            (col("group") == group) &
            (col("res_kind") == "VirtualMachine") &
            (col(aggregation_field).isNotNull())) \
        .withColumn("name", regexp_replace("name", r"\.", "-")) \
        .aggregate(aggregation)
    return agg_stream
def for_each_metric(metric_name):
    """
    Build an averaged stream for one cpu metric.

    NOTE(review): nested helper — relies on `read_stream` and `self` from the
    enclosing scope.

    :param metric_name: name of cpu metric which needs to be averaged.
    :return: the aggregated stream (averages grouped by res_kind/group/name).
    """
    aggregation = Avg(group_fields=["res_kind", "group", "name"],
                      aggregation_field=metric_name,
                      aggregation_name=self._component_name)
    # Select only the needed metric column directly instead of expanding
    # all of `metrics.*` and re-selecting one column — cheaper, and
    # consistent with the sibling mem/net aggregation helper.
    agg_stream = read_stream \
        .select("@timestamp", "group", "res_kind", "name",
                col("metrics.{}".format(metric_name)).alias(metric_name)) \
        .filter(
            (col("group") == "cpu") &
            (col("res_kind") == "VirtualMachine") &
            (col(metric_name).isNotNull())) \
        .withColumn("name", regexp_replace("name", r"\.", "-")) \
        .aggregate(aggregation)
    return agg_stream
def _process_pipeline(self, json_stream):
    """Average graphics-memory counters (totalKb/peakKb/freeKb) per known mapping."""
    # Raw mapping label -> normalized metric name; anything else is dropped.
    mapping_table = [
        ("CRR (SECURE)", "crr_secure"),
        ("GFX", "gfx"),
        ("MAIN", "main"),
        ("PICBUF0", "picbuf0"),
        ("PICBUF1", "picbuf1"),
        ("SAGE (SECURE)", "sage_secure"),
    ]
    first_raw, first_normalized = mapping_table[0]
    mapping_column = when(col("mapping") == first_raw, first_normalized)
    for raw, normalized in mapping_table[1:]:
        mapping_column = mapping_column.when(col("mapping") == raw, normalized)
    mapping_column = mapping_column.otherwise("unclassified")

    classified = json_stream \
        .selectExpr("GraphicsMemoryUsage.*", "`@timestamp`") \
        .withColumn("mapping", mapping_column) \
        .where("mapping != 'unclassified'")
    return [
        classified.aggregate(
            Avg(group_fields="mapping",
                aggregation_field=field,
                aggregation_name=self._component_name))
        for field in ["totalKb", "peakKb", "freeKb"]
    ]
def _process_pipeline(self, uxp_stream):
    """
    Returns list with streams for aggregated fields.

    :param uxp_stream: input stream
    :return: list of processed streams (counts and average response time)
    """
    monitored = uxp_stream \
        .where(uxp_stream.url.isin(self.__processing_urls)) \
        .select(custom_translate_like(col("url"), self.__url_mapping, lit("undefined")).alias("action"),
                col("status code").alias("statusCode"),
                col("responseTime"),
                col("@timestamp"))
    count_stream = monitored.aggregate(
        Count(group_fields=["action", "statusCode"],
              aggregation_name=self._component_name))
    avg_response_time_stream = monitored.aggregate(
        Avg(aggregation_field="responseTime",
            group_fields=["action"],
            aggregation_name=self._component_name))
    return [count_stream, avg_response_time_stream]
def _process_pipeline(self, read_stream):
    """Build count and latency aggregation streams over the input."""
    by_payload_status = read_stream.aggregate(
        Count(group_fields=["payload_status"],
              aggregation_name=self._component_name))
    by_status = read_stream.aggregate(
        Count(group_fields=["status"],
              aggregation_name=self._component_name))
    latency_by_uri = read_stream.aggregate(
        Avg(group_fields=["uri"],
            aggregation_field="latency",
            aggregation_name=self._component_name,
            use_udf=True))
    by_uri_and_status = read_stream.aggregate(
        Count(group_fields=["uri", "status"],
              aggregation_name=self._component_name,
              use_udf=True))
    return [by_payload_status, by_status, latency_by_uri, by_uri_and_status]
def __wireless_average_downstream_kbps(self, common_wifi_pipeline):
    """Average wireless downstream rate (rxKbps) over events that report it."""
    with_rx_rate = common_wifi_pipeline.where("rxKbps is not NULL")
    return with_rx_rate.aggregate(
        Avg(aggregation_field="rxKbps",
            aggregation_name=self._component_name + ".downstream_kbps"))
def __average_usage_low_priority_mode(self, common_vm_stat_pipeline, time_in_percents):
    """Average CPU time spent in low-priority (nice) mode, in percent."""
    nice_usage = common_vm_stat_pipeline.select("@timestamp", col("nicePct"))
    return nice_usage.aggregate(
        Avg(aggregation_field="nicePct",
            aggregation_name=self._component_name + time_in_percents))
def __average_usage_cpu_in_wait(self, common_vm_stat_pipeline, time_in_percents):
    """Average CPU time spent waiting on I/O, in percent."""
    iowait_usage = common_vm_stat_pipeline.select("@timestamp", col("iowaitPct"))
    return iowait_usage.aggregate(
        Avg(aggregation_field="iowaitPct",
            aggregation_name=self._component_name + time_in_percents))
def __average_usage_hardware_interrupt(self, common_vm_stat_pipeline, time_in_percents):
    """Average CPU time spent servicing hardware interrupts, in percent."""
    hw_irq_usage = common_vm_stat_pipeline.select("@timestamp", col("hwIrqPct"))
    return hw_irq_usage.aggregate(
        Avg(aggregation_field="hwIrqPct",
            aggregation_name=self._component_name + time_in_percents))
def __average_uptime_across_stb(self, common_vm_stat_pipeline):
    """Average uptime (seconds) across STBs."""
    uptime = common_vm_stat_pipeline \
        .select("@timestamp", col("uptime").alias("uptime_sec"))
    return uptime.aggregate(
        Avg(aggregation_field="uptime_sec",
            aggregation_name=self._component_name))
def __average_user_active_mode(self, common_vm_stat_pipeline, time_in_percents):
    """Average CPU time spent in user mode, in percent."""
    user_usage = common_vm_stat_pipeline.select("@timestamp", col("userPct"))
    return user_usage.aggregate(
        Avg(aggregation_field="userPct",
            aggregation_name=self._component_name + time_in_percents))
def __avg_memory_free_kb(self, read_stream):
    """Average free memory (kB) over events that report it."""
    with_free_kb = read_stream.where("freeKb is not NULL")
    return with_free_kb.aggregate(
        Avg(aggregation_field="freeKb",
            aggregation_name=self._component_name))
def __avg_response_time_by_method(self, read_stream):
    """Average response time per (hostname, method), skipping rows without a method."""
    with_method = read_stream.where("method is not null")
    return with_method.aggregate(
        Avg(group_fields=["hostname", "method"],
            aggregation_field="response_time",
            aggregation_name=self._component_name))
def __avg_response_time(self, read_stream):
    """Average response time per hostname."""
    response_time_avg = Avg(group_fields=["hostname"],
                            aggregation_field="response_time",
                            aggregation_name=self._component_name)
    return read_stream.aggregate(response_time_avg)
def __avg_snr(self, read_stream):
    """Average SNR over events that report it."""
    with_snr = read_stream.where("SNR is not NULL")
    return with_snr.aggregate(
        Avg(aggregation_field="SNR",
            aggregation_name=self._component_name))
def average_temperature(self, common_temperature_pipeline):
    """Average temperature per sensor name, ignoring negative readings."""
    valid_readings = common_temperature_pipeline.where(col("temperature") >= 0)
    return valid_readings.aggregate(
        Avg(aggregation_field="temperature",
            group_fields=["name"],
            aggregation_name=self._component_name))
def __avg_duration(self, read_stream):
    """Average duration per hostname for INFO-level events."""
    info_events = read_stream.where("level == 'INFO'")
    return info_events.aggregate(
        Avg(group_fields=["hostname"],
            aggregation_field="duration",
            aggregation_name=self._component_name))
def __avg_signal_level_dbm(self, read_stream):
    """Average signal level (dBm) over events that report it."""
    with_signal = read_stream.where("signalLevel is not NULL")
    return with_signal.aggregate(
        Avg(aggregation_field="signalLevel",
            aggregation_name=self._component_name + ".dbm"))
def __ethernet_average_downstream_kbps(self, read_stream):
    """Average ethernet downstream rate (rxKbps) over events that report it."""
    with_rx_rate = read_stream.where("rxKbps is not NULL")
    return with_rx_rate.aggregate(
        Avg(aggregation_field="rxKbps",
            aggregation_name=self._component_name + ".downstream_kbps"))