def general_metrics_create(request):
    """
    Endpoint for general metrics aggregation
    """
    application = request.context.resource
    if request.method.upper() == "OPTIONS":
        return check_cors(request, application)
    else:
        check_cors(request, application, should_return=False)

    params = dict(request.params.copy())
    proto_version = parse_proto(params.get("protocol_version", ""))
    payload = request.unsafe_json_body
    sequence_accepted = request.matched_route.name == "api_general_metrics"
    if sequence_accepted:
        if application.allow_permanent_storage:
            schema = GeneralMetricsPermanentListSchema().bind(
                utcnow=datetime.datetime.utcnow()
            )
        else:
            schema = GeneralMetricsListSchema().bind(utcnow=datetime.datetime.utcnow())
    else:
        if application.allow_permanent_storage:
            schema = GeneralMetricPermanentSchema().bind(
                utcnow=datetime.datetime.utcnow()
            )
        else:
            schema = GeneralMetricSchema().bind(utcnow=datetime.datetime.utcnow())

    deserialized_metrics = schema.deserialize(payload)
    if sequence_accepted is False:
        deserialized_metrics = [deserialized_metrics]

    rate_limiting(
        request,
        application,
        "per_application_metrics_rate_limit",
        len(deserialized_metrics),
    )
    tasks.add_metrics.delay(
        application.resource_id, params, deserialized_metrics, proto_version
    )

    log.info(
        "METRICS call {} {} client:{}".format(
            application.resource_name, proto_version, request.headers.get("user_agent")
        )
    )
    return "OK: Metrics accepted"
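
# A minimal sketch of the kind of payload this endpoint is meant to accept,
# mirroring the normalized metric shape built in request_metrics_create() below
# (namespace / timestamp / server_name / tags). The exact accepted fields are
# dictated by GeneralMetricsListSchema, so treat the keys here as an
# illustrative assumption rather than the authoritative wire format.
EXAMPLE_GENERAL_METRICS_PAYLOAD = [
    {
        "namespace": "myapp.workers",  # assumed: free-form metric namespace
        "timestamp": "2016-01-01T12:00:00",
        "server_name": "web01",
        "tags": [["queue_size", 12], ["memory_mb", 256.5]],
    }
]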
def add_uptime_stats(params, metric):
    proto_version = parse_proto(params.get("protocol_version"))
    try:
        application = ApplicationService.by_id_cached()(metric["resource_id"])
        if not application:
            return
        # merge the cached instance back into the current session
        application = DBSession.merge(application, load=False)
        start_interval = convert_date(metric["timestamp"])
        start_interval = start_interval.replace(second=0, microsecond=0)
        new_metric = UptimeMetric(
            start_interval=start_interval,
            response_time=metric["response_time"],
            status_code=metric["status_code"],
            is_ok=metric["is_ok"],
            location=metric.get("location", 1),
            tries=metric["tries"],
            resource_id=application.resource_id,
            owner_user_id=application.owner_user_id,
        )
        DBSession.add(new_metric)
        DBSession.flush()
        add_metrics_uptime([new_metric.es_doc()])
        if metric["is_ok"]:
            event_types = [Event.types["uptime_alert"]]
            statuses = [Event.statuses["active"]]
            # close active uptime alerts older than 5 min (6 min window for safety)
            events = EventService.by_type_and_status(
                event_types,
                statuses,
                older_than=(datetime.utcnow() - timedelta(minutes=6)),
                app_ids=[application.resource_id],
            )
            for event in events:
                event.close()
        else:
            UptimeMetricService.check_for_alert(application, metric=metric)
        action = "METRICS UPTIME"
        metrics_msg = "%s: %s, proto:%s" % (action, str(application), proto_version)
        log.info(metrics_msg)
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        add_uptime_stats.retry(exc=exc)
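
# Illustrative sketch of the `metric` dict add_uptime_stats() consumes. The keys
# below are exactly the ones the task reads; the values are made up.
EXAMPLE_UPTIME_METRIC = {
    "resource_id": 1,            # application the probe belongs to
    "timestamp": "2016-01-01T12:00:00",
    "response_time": 0.342,      # seconds
    "status_code": 200,
    "is_ok": True,               # False triggers UptimeMetricService.check_for_alert()
    "location": 1,               # optional, defaults to 1
    "tries": 1,
}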
def reports_create(request):
    """
    Endpoint for exception and slowness reports
    """
    # route_url('reports')
    application = request.context.resource
    if request.method.upper() == 'OPTIONS':
        return check_cors(request, application)
    else:
        check_cors(request, application, should_return=False)

    params = dict(request.params.copy())
    proto_version = parse_proto(params.get('protocol_version', ''))
    payload = request.unsafe_json_body
    sequence_accepted = request.matched_route.name == 'api_reports'
    if sequence_accepted:
        schema = ReportListSchema_0_5().bind(
            utcnow=datetime.datetime.utcnow())
    else:
        schema = ReportSchema_0_5().bind(
            utcnow=datetime.datetime.utcnow())

    deserialized_reports = schema.deserialize(payload)
    if sequence_accepted is False:
        deserialized_reports = [deserialized_reports]
    if deserialized_reports:
        rate_limiting(request, application,
                      'per_application_reports_rate_limit',
                      len(deserialized_reports))

        # pprint.pprint(deserialized_reports)
        tasks.add_reports.delay(application.resource_id, params,
                                deserialized_reports)
    log.info('REPORT call %s, %s client:%s' % (
        application,
        proto_version,
        request.headers.get('user_agent'))
    )
    return 'OK: Reports accepted'
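
# A rough client-side sketch of how a report batch might be pushed at this
# endpoint. The URL path and the API-key header name are assumptions (they
# depend on route configuration and the auth policy); only the fact that the
# body is a JSON list of reports follows from ReportListSchema_0_5 above.
def _example_send_reports(api_key, reports):
    import requests  # assumed available in the client environment
    return requests.post(
        "https://appenlight.example.com/api/reports",   # assumed route URL
        json=reports,                                   # a list hits the 'api_reports' route
        headers={"X-appenlight-api-key": api_key},      # assumed header name
        timeout=10,
    )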
def logs_create(request):
    """
    Endpoint for log aggregation
    """
    application = request.context.resource
    if request.method.upper() == "OPTIONS":
        return check_cors(request, application)
    else:
        check_cors(request, application, should_return=False)

    params = dict(request.params.copy())
    proto_version = parse_proto(params.get("protocol_version", ""))
    payload = request.unsafe_json_body
    sequence_accepted = request.matched_route.name == "api_logs"

    if sequence_accepted:
        if application.allow_permanent_storage:
            schema = LogListPermanentSchema().bind(utcnow=datetime.datetime.utcnow())
        else:
            schema = LogListSchema().bind(utcnow=datetime.datetime.utcnow())
    else:
        if application.allow_permanent_storage:
            schema = LogSchemaPermanent().bind(utcnow=datetime.datetime.utcnow())
        else:
            schema = LogSchema().bind(utcnow=datetime.datetime.utcnow())

    deserialized_logs = schema.deserialize(payload)
    if sequence_accepted is False:
        deserialized_logs = [deserialized_logs]

    rate_limiting(
        request, application, "per_application_logs_rate_limit", len(deserialized_logs)
    )

    # pprint.pprint(deserialized_logs)

    # we need to split those out so we can process the pkey ones one by one
    non_pkey_logs = [
        log_dict for log_dict in deserialized_logs if not log_dict["primary_key"]
    ]
    pkey_dict = {}
    # try to process the logs as best as we can and group together to reduce
    # the amount of queries
    for log_dict in deserialized_logs:
        if log_dict["primary_key"]:
            key = (log_dict["primary_key"], log_dict["namespace"])
            if key not in pkey_dict:
                pkey_dict[key] = []
            pkey_dict[key].append(log_dict)

    if non_pkey_logs:
        log.debug("%s non-pkey logs received: %s" % (application, len(non_pkey_logs)))
        tasks.add_logs.delay(application.resource_id, params, non_pkey_logs)
    if pkey_dict:
        logs_to_insert = []
        for primary_key_tuple, payload in pkey_dict.items():
            sorted_logs = sorted(payload, key=lambda x: x["date"])
            logs_to_insert.append(sorted_logs[-1])
        log.debug("%s pkey logs received: %s" % (application, len(logs_to_insert)))
        tasks.add_logs.delay(application.resource_id, params, logs_to_insert)

    log.info(
        "LOG call %s %s client:%s"
        % (application, proto_version, request.headers.get("user_agent"))
    )
    return "OK: Logs accepted"
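
# Small self-contained illustration of the primary-key grouping above: for each
# (primary_key, namespace) pair only the newest entry (by "date") is forwarded,
# so re-sent logs overwrite rather than duplicate. The sample dicts are made up
# but use the same keys the view relies on.
def _example_pkey_dedup():
    logs = [
        {"primary_key": "job-1", "namespace": "jobs", "date": "2016-01-01T10:00:00", "message": "queued"},
        {"primary_key": "job-1", "namespace": "jobs", "date": "2016-01-01T10:05:00", "message": "finished"},
        {"primary_key": None, "namespace": "jobs", "date": "2016-01-01T10:01:00", "message": "heartbeat"},
    ]
    pkey_dict = {}
    for log_dict in logs:
        if log_dict["primary_key"]:
            key = (log_dict["primary_key"], log_dict["namespace"])
            pkey_dict.setdefault(key, []).append(log_dict)
    # keep only the newest log per key
    return [sorted(group, key=lambda x: x["date"])[-1] for group in pkey_dict.values()]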
def request_metrics_create(request):
    """
    Endpoint for performance metrics, aggregates view performance stats
    and converts them to general metric rows
    """
    application = request.context.resource
    if request.method.upper() == "OPTIONS":
        return check_cors(request, application)
    else:
        check_cors(request, application, should_return=False)

    params = dict(request.params.copy())
    proto_version = parse_proto(params.get("protocol_version", ""))

    payload = request.unsafe_json_body
    schema = MetricsListSchema()
    dataset = schema.deserialize(payload)

    rate_limiting(
        request, application, "per_application_metrics_rate_limit", len(dataset)
    )

    # looping report data
    metrics = {}
    for metric in dataset:
        server_name = metric.get("server", "").lower() or "unknown"
        start_interval = convert_date(metric["timestamp"])
        start_interval = start_interval.replace(second=0, microsecond=0)

        for view_name, view_metrics in metric["metrics"]:
            key = "%s%s%s" % (metric["server"], start_interval, view_name)
            if key not in metrics:
                metrics[key] = {
                    "requests": 0,
                    "main": 0,
                    "sql": 0,
                    "nosql": 0,
                    "remote": 0,
                    "tmpl": 0,
                    "custom": 0,
                    "sql_calls": 0,
                    "nosql_calls": 0,
                    "remote_calls": 0,
                    "tmpl_calls": 0,
                    "custom_calls": 0,
                    "start_interval": start_interval,
                    "server_name": server_name,
                    "view_name": view_name,
                }
            metrics[key]["requests"] += int(view_metrics["requests"])
            metrics[key]["main"] += round(view_metrics["main"], 5)
            metrics[key]["sql"] += round(view_metrics["sql"], 5)
            metrics[key]["nosql"] += round(view_metrics["nosql"], 5)
            metrics[key]["remote"] += round(view_metrics["remote"], 5)
            metrics[key]["tmpl"] += round(view_metrics["tmpl"], 5)
            metrics[key]["custom"] += round(view_metrics.get("custom", 0.0), 5)
            metrics[key]["sql_calls"] += int(view_metrics.get("sql_calls", 0))
            metrics[key]["nosql_calls"] += int(view_metrics.get("nosql_calls", 0))
            metrics[key]["remote_calls"] += int(view_metrics.get("remote_calls", 0))
            metrics[key]["tmpl_calls"] += int(view_metrics.get("tmpl_calls", 0))
            metrics[key]["custom_calls"] += int(view_metrics.get("custom_calls", 0))

            if not metrics[key]["requests"]:
                # fix this here because validator can't
                metrics[key]["requests"] = 1

    # metrics dict is being built to minimize the amount of queries used
    # in case we get multiple rows from the same minute
    normalized_metrics = []
    for metric in metrics.values():
        new_metric = {
            "namespace": "appenlight.request_metric",
            "timestamp": metric.pop("start_interval"),
            "server_name": metric["server_name"],
            "tags": list(metric.items()),
        }
        normalized_metrics.append(new_metric)

    tasks.add_metrics.delay(
        application.resource_id, params, normalized_metrics, proto_version
    )

    log.info(
        "REQUEST METRICS call {} {} client:{}".format(
            application.resource_name, proto_version, request.headers.get("user_agent")
        )
    )
    return "OK: request metrics accepted"
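
# Self-contained illustration of the aggregation key used above: two samples for
# the same server/view that fall into the same minute share one key and are
# therefore summed into a single metric row. Values are made up.
def _example_minute_key():
    from datetime import datetime
    interval_a = datetime(2016, 1, 1, 12, 30, 45).replace(second=0, microsecond=0)
    interval_b = datetime(2016, 1, 1, 12, 30, 59).replace(second=0, microsecond=0)
    key_a = "%s%s%s" % ("web01", interval_a, "home.index")
    key_b = "%s%s%s" % ("web01", interval_b, "home.index")
    return key_a == key_b  # True -> both samples accumulate into one row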
def add_reports(resource_id, request_params, dataset, **kwargs):
    proto_version = parse_proto(request_params.get("protocol_version", ""))
    current_time = datetime.utcnow().replace(second=0, microsecond=0)
    try:
        # we will store es docs here for a single bulk insert
        es_report_docs = {}
        es_report_group_docs = {}
        resource = ApplicationService.by_id(resource_id)

        tags = []
        es_slow_calls_docs = {}
        es_reports_stats_rows = {}
        for report_data in dataset:
            # build report details for later
            added_details = 0
            report = Report()
            report.set_data(report_data, resource, proto_version)
            report._skip_ft_index = True

            # find the latest group in this month's partition
            report_group = ReportGroupService.by_hash_and_resource(
                report.resource_id,
                report.grouping_hash,
                since_when=datetime.utcnow().date().replace(day=1),
            )
            occurences = report_data.get("occurences", 1)
            if not report_group:
                # total_reports will be incremented a moment later
                report_group = ReportGroup(
                    grouping_hash=report.grouping_hash,
                    occurences=0,
                    total_reports=0,
                    last_report=0,
                    priority=report.priority,
                    error=report.error,
                    first_timestamp=report.start_time,
                )
                report_group._skip_ft_index = True
                report_group.report_type = report.report_type
            report.report_group_time = report_group.first_timestamp
            add_sample = pick_sample(
                report_group.occurences, report_type=report_group.report_type
            )
            if add_sample:
                resource.report_groups.append(report_group)
                report_group.reports.append(report)
                added_details += 1
                DBSession.flush()
                if report.partition_id not in es_report_docs:
                    es_report_docs[report.partition_id] = []
                es_report_docs[report.partition_id].append(report.es_doc())
                tags.extend(list(report.tags.items()))
                slow_calls = report.add_slow_calls(report_data, report_group)
                DBSession.flush()
                for s_call in slow_calls:
                    if s_call.partition_id not in es_slow_calls_docs:
                        es_slow_calls_docs[s_call.partition_id] = []
                    es_slow_calls_docs[s_call.partition_id].append(s_call.es_doc())
                # try generating new stat rows if needed
            else:
                # required for postprocessing to not fail later
                report.report_group = report_group

            stat_row = ReportService.generate_stat_rows(report, resource, report_group)
            if stat_row.partition_id not in es_reports_stats_rows:
                es_reports_stats_rows[stat_row.partition_id] = []
            es_reports_stats_rows[stat_row.partition_id].append(stat_row.es_doc())

            # see if we should mark the 10th/100th occurrence of the report
            last_occurences_10 = int(math.floor(report_group.occurences / 10))
            curr_occurences_10 = int(
                math.floor((report_group.occurences + report.occurences) / 10)
            )
            last_occurences_100 = int(math.floor(report_group.occurences / 100))
            curr_occurences_100 = int(
                math.floor((report_group.occurences + report.occurences) / 100)
            )
            notify_occurences_10 = last_occurences_10 != curr_occurences_10
            notify_occurences_100 = last_occurences_100 != curr_occurences_100
            report_group.occurences = ReportGroup.occurences + occurences
            report_group.last_timestamp = report.start_time
            report_group.summed_duration = ReportGroup.summed_duration + report.duration
            summed_duration = ReportGroup.summed_duration + report.duration
            summed_occurences = ReportGroup.occurences + occurences
            report_group.average_duration = summed_duration / summed_occurences
            report_group.run_postprocessing(report)
            if added_details:
                report_group.total_reports = ReportGroup.total_reports + 1
                report_group.last_report = report.id
            report_group.set_notification_info(
                notify_10=notify_occurences_10, notify_100=notify_occurences_100
            )
            DBSession.flush()
            report_group.get_report().notify_channel(report_group)
            if report_group.partition_id not in es_report_group_docs:
                es_report_group_docs[report_group.partition_id] = []
            es_report_group_docs[report_group.partition_id].append(
                report_group.es_doc()
            )

            action = "REPORT"
            log_msg = "%s: %s %s, client: %s, proto: %s" % (
                action,
                report_data.get("http_status", "unknown"),
                str(resource),
                report_data.get("client"),
                proto_version,
            )
            log.info(log_msg)

        total_reports = len(dataset)
        redis_pipeline = Datastores.redis.pipeline(transaction=False)
        key = REDIS_KEYS["counters"]["reports_per_minute"].format(current_time)
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600 * 24)
        key = REDIS_KEYS["counters"]["events_per_minute_per_user"].format(
            resource.owner_user_id, current_time
        )
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600)
        key = REDIS_KEYS["counters"]["reports_per_hour_per_app"].format(
            resource_id, current_time.replace(minute=0)
        )
        redis_pipeline.incr(key, total_reports)
        redis_pipeline.expire(key, 3600 * 24 * 7)
        redis_pipeline.sadd(
            REDIS_KEYS["apps_that_got_new_data_per_hour"].format(
                current_time.replace(minute=0)
            ),
            resource_id,
        )
        redis_pipeline.execute()

        add_reports_es(es_report_group_docs, es_report_docs)
        add_reports_slow_calls_es(es_slow_calls_docs)
        add_reports_stats_rows_es(es_reports_stats_rows)
        return True
    except Exception as exc:
        print_traceback(log)
        if celery.conf["CELERY_EAGER_PROPAGATES_EXCEPTIONS"]:
            raise
        add_reports.retry(exc=exc)
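
# Tiny worked example of the 10th/100th-occurrence check performed in
# add_reports(): the notification flag flips only when the running total crosses
# a multiple of 10 (or 100). The helper name and numbers are illustrative only.
def _example_occurence_threshold(previous_total, new_occurences):
    import math
    last_10 = int(math.floor(previous_total / 10))
    curr_10 = int(math.floor((previous_total + new_occurences) / 10))
    return last_10 != curr_10

# _example_occurence_threshold(8, 1) -> False (total 9, no threshold crossed)
# _example_occurence_threshold(9, 1) -> True  (total 10 crosses the 10th occurrence)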