def do_request_for_scaling(alert_id):
    """
    Request the SO to scale the NS associated with the given alert.

    Parameters
    ----------
    alert_id: string
        Identifier of the alert stored in alert_db; the alert record carries
        the NS id and the rule actions describing the target IL.

    Returns
    -------
    None
    """
    alert = alert_db.get_alert(alert_id)
    ns_id = alert['ns_id']
    ns_status = ns_db.get_ns_status(ns_id)
    current_il = ns_db.get_ns_il(ns_id)
    # The status does not depend on the rule action being examined, so check
    # it once up front instead of once per rule action (as the old code did).
    if ns_status in ["FAILED", "TERMINATED", "INSTANTIATING"]:
        log_queue.put(["DEBUG", "Current Status is " + ns_status +
                       " for nsId: " + ns_id])
        log_queue.put(["DEBUG", "This status is not fit to scaling actions"])
        return
    for rule_action in alert['ruleActions']:
        # Skip actions that would scale to the level we are already at.
        # (Fixed missing space before 'and' in the log message.)
        if rule_action['scaleNsToLevelData']['nsInstantiationLevel'] == current_il:
            log_queue.put(["DEBUG", "Current nsInstantiationLevel for nsId: " +
                           ns_id + ' and Alert nsInstantiationLevel is the same'])
            continue
        log_queue.put(["DEBUG", "Do scaling request for alert: " + alert_id])
        request_to_so_scale_ns(alert)
def do_request_for_scaling(alert_id):
    """
    Request the SO to scale the NS associated with the given alert,
    supporting both regular and nested Network Services.

    Parameters
    ----------
    alert_id: string
        Identifier of the alert stored in alert_db; the alert record carries
        the NS id, the nsd id and the rule actions with the target IL.

    Returns
    -------
    None
    """
    alert = alert_db.get_alert(alert_id)
    ns_id = alert['ns_id']
    ns_status = ns_db.get_ns_status(ns_id)
    # Auto-scaling of nested NSs: the alert and the autoscaling rules are
    # defined in the nested descriptors, so the current IL must come from the
    # corresponding nested-service record rather than from the composite NS.
    nested_info = ns_db.get_nested_service_info(ns_id)
    if nested_info:
        nsdId = alert['nsd_id']
        nsId_tmp = ns_id + '_' + nsdId
        particular_nested_info = ns_db.get_particular_nested_service_info(ns_id, nsId_tmp)
        current_il = particular_nested_info['nested_il']
    else:
        current_il = ns_db.get_ns_il(ns_id)
    # The status does not depend on the rule action being examined, so check
    # it once up front instead of once per rule action (as the old code did).
    if ns_status in ["FAILED", "TERMINATED", "INSTANTIATING", "SCALING"]:
        log_queue.put(["DEBUG", "Current Status is " + ns_status +
                       " for nsId: " + ns_id])
        log_queue.put(["DEBUG", "This status is not fit to scaling actions"])
        return
    for rule_action in alert['ruleActions']:
        # Skip actions that would scale to the level we are already at.
        # (Fixed missing space before 'and' in the log message.)
        if rule_action['scaleNsToLevelData']['nsInstantiationLevel'] == current_il:
            log_queue.put(["DEBUG", "Current nsInstantiationLevel for nsId: " +
                           ns_id + ' and Alert nsInstantiationLevel is the same'])
            continue
        log_queue.put(["DEBUG", "Do scaling request for alert: " + alert_id])
        request_to_so_scale_ns(alert)
def update_ns_aiml_scale_work(nsId, aiml_scaling_info):
    """
    After the scaling produced by the AIML notification, resubmit the spark
    job and publish the new IL in the kafka topic.

    Parameters
    ----------
    nsId: string
        Network Service Id.
    aiml_scaling_info: dict
        Dictionary with the information generated when creating the scaling
        aiml work (topicId, streamingClass, model, collectionPeriod, ...).
        Updated in place with the new "streamingJobId".

    Returns
    -------
    None
    """
    log_queue.put(["DEBUG",
                   "Updating the AIML info after scaling for nsId: %s and info:" % nsId])
    log_queue.put(["DEBUG", json.dumps(aiml_scaling_info, indent=4)])
    kafka_topic = aiml_scaling_info["topicId"]
    streaming_class = aiml_scaling_info["streamingClass"]
    model_name = aiml_scaling_info["model"]
    collectionPeriod = aiml_scaling_info["collectionPeriod"]
    # 1 - restart spark job
    status_file = spark_folder + "/" + kafka_topic + ".txt"
    spark_job_id = start_spark_streaming_job(nsId, kafka_topic, streaming_class,
                                             model_name, collectionPeriod,
                                             kafka_ip + ":" + kafka_port,
                                             alert_target, status_file)
    aiml_scaling_info["streamingJobId"] = spark_job_id
    # 2 - publish the IL in the kafka topic
    currentIL = ns_db.get_ns_il(nsId)
    current_IL = {"type_message": "nsStatusMetrics",
                  "metric": {"__name__": "nsInstantiationLevel",
                             "nsId": nsId,
                             },
                  "value": currentIL}
    # BUG FIX: publish the structured message (current_IL) rather than the
    # bare IL value (currentIL); this matches the message format used by
    # configure_ns_aiml_scale_work when the job is first set up.
    monitoring.publish_json_kafka(kafka_topic, current_IL)
    # 3 - update the db
    log_queue.put(["DEBUG", "New scaling info: "])
    log_queue.put(["DEBUG", json.dumps(aiml_scaling_info, indent=4)])
    ns_db.set_aiml_info(nsId, "scaling", aiml_scaling_info)
def scale_ns_process(nsId, body):
    """
    Scale the Network Service identified by "nsId" as requested in body.

    Parameters
    ----------
    nsId: string
        Identifier of the Network Service Instance.
    body: request body including scaling operation

    Returns
    -------
    None
    """
    log_queue.put(["INFO", "scale_ns_process with nsId %s, body %s" % (nsId, body)])
    # Descriptor id plus the deployment flavour / instantiation level the
    # service is currently running at.
    nsdId = ns_db.get_nsdId(nsId)
    current_df = ns_db.get_ns_df(nsId)
    current_il = ns_db.get_ns_il(nsId)
    # Fetch the NS descriptor and, from it, every referenced VNF descriptor.
    nsd_json = nsd_db.get_nsd_json(nsdId, None)
    vnfds_json = {vnf_id: vnfd_db.get_vnfd_json(vnf_id, None)
                  for vnf_id in nsd_json["nsd"]["vnfdId"]}
    # Hand the operation over to the RO engine.
    sap_info_pre_scaling = ns_db.get_ns_sap_info(nsId)
    rooe.scale_ns(nsId, nsd_json, vnfds_json, body, current_df, current_il)
    # Monitoring jobs may need refreshing; new performance monitoring jobs
    # are assumed to resemble the ones already present.
    sap_info = ns_db.get_ns_sap_info(nsId)
    log_queue.put(["INFO", "new sapInfo after scaling: %s" % (sap_info)])
    monitoring.update_ns_monitoring(nsId, nsd_json, vnfds_json, sap_info)
    log_queue.put(["DEBUG",
                   "monitoring exporters updated after scaling for nsId %s" % (nsId)])
    # Alert definitions themselves do not need updating here.
    log_queue.put(["INFO", "scale_ns_process finished for nsId %s" % (nsId)])
def post(self):
    """
    Webhook endpoint for incoming alert notifications.

    Handles three payload shapes:
    - "alerts": Prometheus-style alert batches that may trigger scaling,
    - "alertname": log-based alerts coming from elastalert,
    - "aiml": notifications produced by the AIML (Spark) model execution.

    Returns
    -------
    tuple
        ("OK", 200) in every case.
    """
    data = request.get_json(force=True)
    if "alerts" in data:
        for alert in data['alerts']:
            labels = alert['labels']
            str_starts_at = str(alert['startsAt'])
            alertname = labels["alertname"]
            log_message = ("Received alert: " + alertname + " startsAt: " +
                           str_starts_at + " status: " + alert['status'])
            log_queue.put(["INFO", log_message])
            if alert['status'] == 'resolved':
                # Clear the stored start timestamp once the alert resolves.
                alert_db.set_timestamp(alertname, "")
                continue
            if alert_db.exists_alert_id(alertname):
                if not is_problem_resolved(alert):
                    log_queue.put(["DEBUG", "Alert is not resolved= " + alertname +
                                   " start date = " + str_starts_at])
                    do_request_for_scaling(alertname)
    # checks if this log comes from elastalert
    if "alertname" in data:
        str_starts_at = str(data['startsAt'])
        date_time_obj = datetime.strptime(str_starts_at, "%a %b %d %H:%M:%S %Z %Y")
        str_starts_at = date_time_obj.isoformat()
        alertname = data["alertname"]
        log_message = "Received log alert: " + alertname + " startsAt: " + str_starts_at
        log_queue.put(["INFO", log_message])
        if alert_db.exists_alert_id(alertname):
            alert = {'startsAt': str_starts_at,
                     "labels": {"alertname": alertname}}
            if not is_problem_resolved(alert):
                log_queue.put(["DEBUG", "Alert is not resolved= " + alertname +
                               " start date = " + str_starts_at])
                do_request_for_scaling(alertname)
    if "aiml" in data:
        # Notification from the execution of the aiml model.
        current_time = datetime.now(pytz.utc)
        notification = data["aiml"]
        log_queue.put(["DEBUG", "Notification from Spark Job: %s" % notification])
        ns_id = notification["nsID"]
        nsInstantiationLevel = notification["nsInstantiationLevel"]
        aiml_scaling_info = ns_db.get_aiml_info(ns_id, "scaling")
        currentIL = ns_db.get_ns_il(ns_id)
        if aiml_scaling_info and (nsInstantiationLevel != currentIL):
            timeout = datetime.now(pytz.utc) - current_time
            log_queue.put(["INFO", "*****Time measure: SLAManager SLAManager webhook processing scaling: %s" % timeout])
            log_queue.put(["DEBUG",
                           "Generating a scaling operation for nsId: %s from currentIL: %s to newIL: %s"
                           % (ns_id, currentIL, nsInstantiationLevel)])
            # 1 - stop the spark job
            alert_configure.delete_spark_streaming_job(aiml_scaling_info["streamingJobId"])
            log_queue.put(["INFO", "*****Time measure: SLAManager SLAManager webhook stopped spark job"])
            # 1.5 - remove the kafka topic
            monitoring.delete_kafka_topic(aiml_scaling_info["topicId"])
            log_queue.put(["INFO", "*****Time measure: SLAManager SLAManager webhook deleted kafka topic"])
            # 2 - generate the scaling request
            scale_request = {
                "scaleType": "SCALE_NS",
                "scaleNsData": {
                    "scaleNsToLevelData": {
                        "nsInstantiationLevel": nsInstantiationLevel
                    }
                },
                "scaleTime": "0"
            }
            log_queue.put(["DEBUG", "AIML makes an scaling request for nsId: %s" % ns_id])
            log_queue.put(["DEBUG", "AIML scale request:"])
            make_request_to_so_nbi(ns_id, scale_request)
            log_queue.put(["INFO", "*****Time measure: SLAManager SLAManager webhook made scaling request"])
        else:
            log_queue.put(["DEBUG",
                           "Not generating a scaling operation for nsId: %s" % (ns_id)])
    return "OK", 200
def scale_ns_process(nsId, body, nestedInfo=None):
    """
    Performs the scaling of the service identified by "nsId" according to the info at body
    Parameters
    ----------
    nsId: string
        Identifier of the Network Service Instance.
    body: request body including scaling operation
    nestedInfo: dict, optional
        For nested NSs: maps the nested nsdId to [deployment flavour,
        instantiation level]; when None the values are read from ns_db.
    Returns
    -------
    """
    log_queue.put([
        "INFO",
        "*****Time measure for nsId: %s: SOEc SOEc scaling a nested/regular NS" % nsId
    ])
    log_queue.put(
        ["INFO", "scale_ns_process with nsId %s, body %s" % (nsId, body)])
    # get the nsdId that corresponds to nsId
    if nestedInfo:
        # nested NS: nestedInfo has a single key (the nested nsdId) whose
        # value carries [current_df, current_il]
        nsdId = next(iter(nestedInfo))
        current_df = nestedInfo[nsdId][0]
        current_il = nestedInfo[nsdId][1]
    else:
        nsdId = ns_db.get_nsdId(nsId)
        # get current instantiation level
        current_df = ns_db.get_ns_df(nsId)
        current_il = ns_db.get_ns_il(nsId)
    # first get the ns and vnfs descriptors
    nsd_json = nsd_db.get_nsd_json(nsdId, None)
    # for each vnf in the NSD, get its json descriptor
    vnfdIds = nsd_json["nsd"]["vnfdId"]
    vnfds_json = {}
    for vnfdId in vnfdIds:
        vnfds_json[vnfdId] = vnfd_db.get_vnfd_json(vnfdId, None)
    #request RO
    sap_info_pre_scaling = ns_db.get_ns_sap_info(nsId)
    log_queue.put([
        "INFO",
        "*****Time measure for nsId: %s: SOEc SOEc-ROE prepared info for scaling" % (nsId)
    ])
    rooe.scale_ns(nsId, nsd_json, vnfds_json, body, current_df, current_il,
                  nestedInfo)
    log_queue.put([
        "INFO",
        "*****Time measure for nsId: %s: SOEc SOEc-ROE updated DBs scaling a NS" % (nsId)
    ])
    # checks the result of scaling, maybe it has not be done due to lack of resources
    # NOTE(review): the operation is looked up under "INSTANTIATION", not a
    # scaling-specific key — presumably operations are recorded under that
    # key for this NS; confirm against operation_db's conventions.
    operationId = operation_db.get_operationId(nsId, "INSTANTIATION")
    if ((operation_db.get_operation_status(operationId) == "SUCCESSFULLY_DONE")
            and ns_db.get_ns_status(nsId) == "INSTANTIATED"):
        # maybe we have to update the monitoring jobs: we assume that new performance monitoring jobs
        # will be similar to one already present
        sap_info = ns_db.get_ns_sap_info(nsId)
        log_queue.put(["INFO", "new sapInfo after scaling: %s" % (sap_info)])
        monitoring.update_ns_monitoring(nsId, nsd_json, vnfds_json, sap_info)
        log_queue.put([
            "INFO",
            "*****Time measure for nsId: %s: SOEc SOEc updated monitoring info" % nsId
        ])
        log_queue.put([
            "DEBUG",
            "monitoring exporters updated after scaling for nsId %s" % (nsId)
        ])
        # update alerts: it is not needed
        # however, in the case of aiml_scaling it is needed, to restart the spark job
    else:
        # scaling did not complete; distinguish between a resource shortage
        # (NS still INSTANTIATED) and a MANO-level failure (NS FAILED)
        if ns_db.get_ns_status(nsId) == "INSTANTIATED":
            log_queue.put(
                ["DEBUG", "Scaling operation failed due to lack of resources"])
        elif ns_db.get_ns_status(nsId) == "FAILED":
            log_queue.put(
                ["DEBUG", "Scaling operation failed at the MANO platform"])
    # the AIML spark job was stopped before the scaling request was issued,
    # so restart it whenever the NS is still usable (INSTANTIATED)
    aiml_scaling_info = ns_db.get_aiml_info(nsId, "scaling")
    if (aiml_scaling_info and (ns_db.get_ns_status(nsId) == "INSTANTIATED")):
        log_queue.put(
            ["DEBUG", "The AIML platform is triggering the scaling operation"])
        alert_configure.update_ns_aiml_scale_work(nsId, aiml_scaling_info)
        log_queue.put([
            "INFO",
            "*****Time measure for nsId: %s: SOEc SOEc updated AIML alert job" % nsId
        ])
    log_queue.put(["INFO", "scale_ns_process finished for nsId %s" % (nsId)])
    log_queue.put([
        "INFO",
        "*****Time measure for nsId: %s: SOEc SOEc finished scaling a nested/regular NS" % (nsId)
    ])
    # record a UI notification entry for the completed scaling attempt
    notification_db.create_notification_record({
        "nsId": nsId,
        "type": "fa-gears",
        "text": nsId + " SCALED",
        "time": datetime.now().strftime("%d/%m/%Y %H:%M:%S.%f")
    })
def post(self):
    """
    Webhook endpoint for incoming alert notifications.

    Handles two payload shapes: "alerts" (Prometheus-style alert batches
    that may trigger scaling) and "aiml" (notifications from the execution
    of the aiml model).

    Returns
    -------
    tuple
        ("OK", 200) in every case.
    """
    payload = request.get_json(force=True)
    if "alerts" in payload:
        for incoming in payload['alerts']:
            name = incoming['labels']["alertname"]
            start_ts = str(incoming['startsAt'])
            log_queue.put(["INFO",
                           f"Received alert: {name} startsAt: {start_ts} status: {incoming['status']}"])
            if incoming['status'] == 'resolved':
                # alert resolved: wipe its stored start timestamp
                alert_db.set_timestamp(name, "")
            elif alert_db.exists_alert_id(name) and is_problem_resolved(incoming) == False:
                log_queue.put(["DEBUG",
                               f"Alert is not resolved= {name} start date = {start_ts}"])
                do_request_for_scaling(name)
    if "aiml" in payload:
        # notification produced by the execution of the aiml model
        notification = payload["aiml"]
        ns_id = notification["nsID"]
        target_il = notification["nsInstantiationLevel"]
        scaling_info = ns_db.get_aiml_info(ns_id, "scaling")
        deployed_il = ns_db.get_ns_il(ns_id)
        if scaling_info and target_il != deployed_il:
            # 1 - stop the spark job
            alert_configure.delete_spark_streaming_job(scaling_info["streamingJobId"])
            # 2 - generate the scaling request
            scale_request = {
                "scaleType": "SCALE_NS",
                "scaleNsData": {
                    "scaleNsToLevelData": {
                        "nsInstantiationLevel": target_il
                    }
                },
                "scaleTime": "0"
            }
            log_queue.put(["DEBUG", f"AIML makes an scaling request for nsId: {ns_id}"])
            log_queue.put(["DEBUG", "AIML scale request:"])
            log_queue.put(["DEBUG", json.dumps(scale_request, indent=4)])
            make_request_to_so_nbi(ns_id, scale_request)
        else:
            log_queue.put(["DEBUG",
                           f"Not generating a scaling operation for nsId: {ns_id}"])
    return "OK", 200
def configure_ns_aiml_scale_work(nsId, nsdId, nsd_json, vnfds_json, sap_info):
    """
    Parses the nsd to find possible aiml scale work and, if one is declared,
    sets up the kafka topic, the Prometheus scrapers and the spark job.

    Parameters
    ----------
    nsId: string
        Network Service Id.
    nsdId: string
        String with the kind of Ns associated to the nsId.
    nsd_json: json
        Network service descriptor.
    vnfds_json: dict
        Dict with json of the virtual network functions.
    sap_info: dict
        Information with the service access point associated to the deployed vnfs.

    Returns
    -------
    None
    """
    aiml_scale_dict = {}
    aiml_scaling = False
    # 1 - check that there is an scaling aiml work; keep the first matching rule
    if "aimlRules" in nsd_json["nsd"].keys():
        for rule in nsd_json["nsd"]["aimlRules"]:
            if rule["problem"] == "scaling":
                aiml_scaling = True
                aiml_element = rule
                log_queue.put(["DEBUG", "Scaling operation driven by AIML"])
                break
    # guard clauses replace the original deep nesting: nothing to do without
    # an AIML scaling rule, and nothing can proceed without a kafka topic
    if not aiml_scaling:
        return
    # 2 - create kafka topic
    problem = aiml_element["problem"]
    kafka_topic = monitoring.create_kafka_topic(nsId, problem)
    log_queue.put(["DEBUG", "The created kafka_topic is: %s" % (kafka_topic)])
    if not kafka_topic:
        return
    # 3 - make a call to config manager to create association between monitoring
    # parameters and kafka topic, so Prometheus publish the info in kafka topic
    scrape_jobs = get_performance_metric_for_aiml_rule(nsId, aiml_element, nsd_json)
    log_queue.put(["DEBUG", "Scraper jobs: "])
    log_queue.put(["DEBUG", json.dumps(scrape_jobs, indent=4)])
    scrapes_dict = {}
    collectionPeriod = 1  # we will choose the biggest one among the scrape jobs
    for scrape_job in scrape_jobs:
        scraper = monitoring.create_prometheus_scraper(
            nsId, kafka_topic, scrape_job['vnf'], scrape_job['metric'],
            scrape_job['expression'], scrape_job['collectionPeriod'])
        scrapes_dict.update({scraper['scraperId']: scraper})
        if scrape_job["collectionPeriod"] > collectionPeriod:
            collectionPeriod = scrape_job["collectionPeriod"]
    # 4 - download the model and the streaming class, save the files in the
    # spark_folder: the streaming class (jar file) lives in a common folder and
    # is renamed as class+kafka_topic; the model gets its own folder named
    # after the kafka_topic (static names for the moment)
    streaming_class = "5growth_vCDN_2.11-0.1.jar"
    model_name = "spark-random-forest-model-vCDN"
    status_file = spark_folder + "/" + kafka_topic + ".txt"
    log_queue.put(["DEBUG", "Status file: %s" % status_file])
    # 5 - start the spark job
    spark_job_id = start_spark_streaming_job(nsId, kafka_topic, streaming_class,
                                             model_name, collectionPeriod,
                                             kafka_ip + ":" + kafka_port,
                                             alert_target, status_file)
    if spark_job_id is None:
        log_queue.put(["DEBUG",
                       "Failure in the creation of the spark streaming job"])
        return
    log_queue.put(["DEBUG", "The created spark_job_id is: %s" % (spark_job_id)])
    # 6 - publish the currentIL in kafka topic
    currentIL = ns_db.get_ns_il(nsId)
    current_IL = [{"type_message": "nsStatusMetrics",
                   "metric": {"__name__": "nsInstantiationLevel",
                              "nsId": nsId,
                              },
                   "value": currentIL}]
    monitoring.publish_json_kafka(kafka_topic, current_IL)
    # 7.1 - create the element to be saved in the database
    aiml_scale_dict["topicId"] = kafka_topic
    aiml_scale_dict["streamingClass"] = streaming_class
    aiml_scale_dict["model"] = model_name
    aiml_scale_dict["streamingJobId"] = spark_job_id
    aiml_scale_dict["collectionPeriod"] = collectionPeriod
    # identifiers returned in step 3
    aiml_scale_dict["scrapperJobs"] = scrapes_dict
    # 7.2 - save the info in ns_db; since there may be other aiml jobs, this
    # info is stored as a separate element keyed by "scaling"
    ns_db.set_aiml_info(nsId, "scaling", aiml_scale_dict)