def LoadRewardsDict():
    sFileRewards = config.get_string('REWARDS_FILE')
    dConnRewardsIndexes = collections.defaultdict(lambda: {'iNumNeg': 0, 'iNumPos': 0})
    iMaxIter = config.get_int('REWARDS_MAX_ITER')
    iMinIter = config.get_int('REWARDS_MIN_ITER')
    iCurIter = 0
    for sLine in open(sFileRewards):
        lConns = sLine.strip().split()
        if len(lConns) > 5:
            iCurIter += 1
            if iCurIter > iMaxIter:
                break
            if iCurIter < iMinIter:
                continue
            for sConn in lConns:
                iFrom, iTo, iNumPos, iNumNeg = sConn.split(':')
                iNumPos = int(iNumPos)
                iNumNeg = int(iNumNeg)
                iFrom = int(iFrom)
                iTo = int(iTo)
                dConnRewardsIndexes[(iFrom, iTo)]['iNumNeg'] += iNumNeg
                dConnRewardsIndexes[(iFrom, iTo)]['iNumPos'] += iNumPos
    dIndexToPredList = predicate.PredDictFileToPredListDict(config.get_string('PRED_DICT_FILE'), lambda x: x.iIndex)
    dConnRewardsStrings = collections.defaultdict(lambda: {'iNumNeg': 0, 'iNumPos': 0})
    for (iFrom, iTo), dPosNeg in dConnRewardsIndexes.items():
        sFrom = dIndexToPredList[iFrom][0].GetObject()
        sTo = dIndexToPredList[iTo][0].GetObject()
        dConnRewardsStrings[(sFrom, sTo)]['iNumNeg'] += dPosNeg['iNumNeg']
        dConnRewardsStrings[(sFrom, sTo)]['iNumPos'] += dPosNeg['iNumPos']
    return dConnRewardsStrings
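# For orientation, a minimal self-contained sketch of the line format
# LoadRewardsDict appears to expect: more than 5 whitespace-separated
# "from:to:pos:neg" tokens per counted iteration. The file name and
# iteration bounds come from config; the tokens below are made up.
import collections

sLine = "0:1:3:1 1:2:0:2 2:3:5:0 3:4:1:1 4:5:2:2 5:6:0:1"
dConn = collections.defaultdict(lambda: {'iNumNeg': 0, 'iNumPos': 0})
for sConn in sLine.split():
    iFrom, iTo, iNumPos, iNumNeg = (int(s) for s in sConn.split(':'))
    dConn[(iFrom, iTo)]['iNumPos'] += iNumPos
    dConn[(iFrom, iTo)]['iNumNeg'] += iNumNeg
print(dConn[(0, 1)])  # {'iNumNeg': 1, 'iNumPos': 3}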
class Config(object):
    """Gatling runner package configuration."""

    # Download directory for fetched Gatling packages
    DOWNLOAD_DIRECTORY = os.path.join(os.path.dirname(__file__), "download")

    # Gatling package repository default settings
    GATLING_REPO_URL = os.environ.get("GATLING_REPO_URL")
    GATLING_REPO_NAME = os.environ.get("GATLING_REPO_NAME")
    GATLING_REPO_VERSION = os.environ.get("GATLING_REPO_VERSION")

    # Gatling ssh connector default settings
    GATLING_SSH_PORT = os.environ.get("GATLING_SSH_PORT")
    GATLING_SSH_USERNAME = os.environ.get("GATLING_SSH_USERNAME")
    GATLING_SSH_HOST = os.environ.get("GATLING_SSH_HOST")
    GATLING_SSH_KEY_PATH = os.environ.get("GATLING_SSH_KEY_PATH", config.jumpbox_key_path)

    # Gatling tests connection proxy settings
    GATLING_PROXY = os.environ.get("GATLING_PROXY")
    GATLING_PROXY_HTTP_PORT = config.get_int("GATLING_PROXY_HTTP_PORT")
    GATLING_PROXY_HTTPS_PORT = config.get_int("GATLING_PROXY_HTTPS_PORT")

    # How long (in seconds) to wait before making the next Gatling results check
    TIME_BEFORE_NEXT_TRY = 300

    # How many times to retry fetching Gatling logs while the log file size does not change
    NUMBER_OF_TRIALS_WITHOUT_LOG_CHANGE = 2

    GATLING_PACKAGE_FILE_NAME = "{}-{}.jar".format(GATLING_REPO_NAME, GATLING_REPO_VERSION)
    GATLING_PACKAGE_FILE_URL = "{}{}/{}/{}".format(GATLING_REPO_URL, GATLING_REPO_NAME,
                                                   GATLING_REPO_VERSION, GATLING_PACKAGE_FILE_NAME)
    GATLING_PACKAGE_FILE_PATH = os.path.join(DOWNLOAD_DIRECTORY, GATLING_PACKAGE_FILE_NAME)
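# Illustration of how the derived package fields compose (hypothetical values;
# the class reads the real ones from the environment). Note the "{}{}" template
# concatenates GATLING_REPO_URL directly with the repo name, so the URL value
# needs a trailing slash.
repo_url, repo_name, repo_version = "https://repo.example.com/", "gatling-tests", "1.2.3"
file_name = "{}-{}.jar".format(repo_name, repo_version)
print("{}{}/{}/{}".format(repo_url, repo_name, repo_version, file_name))
# -> https://repo.example.com/gatling-tests/1.2.3/gatling-tests-1.2.3.jar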
def LoadAllFullRewardsFiles():
    sRewardsDir = config.get_string('FULL_REWARDS_LOG_DIR')
    lRewards = []
    print "Loading Rewards Files From:", sRewardsDir
    for iFileNum in range(config.get_int('REWARDS_MIN_ITER'), config.get_int('REWARDS_MAX_ITER')):
        sFile = sRewardsDir + '/predictions.log.' + str(iFileNum)
        lRewards.extend(LoadSingleFullRewardsFile(sFile))
    print "Done Loading"
    print "NumFF:", Reward.ComputeNumFfsFromList(lRewards)
    # Debug leftover: this early exit makes the return below unreachable.
    sys.exit(-1)
    return lRewards
def CalcDisToTerminals(self, iIndex, iLen):
    iWindowSize = config.get_int('FEATURES:WINDOW_SIZE')
    iLeftDis = iIndex
    iRightDis = iLen - iIndex - 1
    if iLeftDis > iWindowSize:
        iLeftDis = iWindowSize + 1
    if iRightDis > iWindowSize:
        iRightDis = iWindowSize + 1
    return iLeftDis, iRightDis
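# A worked example may help. Assuming FEATURES:WINDOW_SIZE is 3 (an assumption;
# the real value is config-driven), token index 8 of a 10-token sentence is
# clamped on the left and exact on the right. Standalone re-implementation of
# the same clamping, for illustration only:
def calc_dis(iIndex, iLen, iWindowSize=3):
    # Any distance beyond the feature window collapses to the sentinel
    # value iWindowSize + 1, exactly as in CalcDisToTerminals above.
    iLeft = min(iIndex, iWindowSize + 1)
    iRight = min(iLen - iIndex - 1, iWindowSize + 1)
    return iLeft, iRight

print(calc_dis(8, 10))  # (4, 1): left edge is "far" (clamped), right edge is 1 away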
def get_proxy_settings(config=None, conn=None):
    r"""
    Return proxy settings as a ProxySettings object

    The caller must specify either config or conn.

    Arguments:
    - `config`: An osdlyrics.config.Config object, this object is used to
      retrieve proxy settings. If it is not set, the caller MUST set conn to
      a valid D-Bus connection to create a Config object
    - `conn`: A D-Bus connection object, this is used when `config` is not
      specified.
    """
    if config is None and conn is None:
        raise ValueError('Either config or conn must be specified')
    if config is None:
        # The original `config = config.Config(conn)` could never work here:
        # the `config` parameter shadows the module and is None at this point.
        # Import the class named in the docstring explicitly instead.
        from osdlyrics.config import Config
        config = Config(conn)
    proxy_type = config.get_string('Download/proxy')
    if proxy_type.lower() == 'no':
        return ProxySettings(protocol='no')
    if proxy_type.lower() == 'manual':
        protocol = config.get_string('Download/proxy-type')
        host = config.get_string('Download/proxy-host')
        port = config.get_int('Download/proxy-port')
        username = config.get_string('Download/proxy-username')
        passwd = config.get_string('Download/proxy-passwd')
        return ProxySettings(protocol=protocol, host=host, port=port,
                             username=username, password=passwd)
    if proxy_type.lower() == 'system':
        return detect_system_proxy()
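# Hedged usage sketch of the two call styles the docstring allows
# (dbus-python assumed for the connection; note the function falls through
# and returns None for an unrecognized proxy type):
import dbus

bus = dbus.SessionBus()
settings = get_proxy_settings(conn=bus)  # builds a Config internally
if settings is not None and settings.protocol != 'no':
    print('proxy: %s://%s:%d' % (settings.protocol, settings.host, settings.port))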
def debug_inject_fault(self, instance_id, targetgroup, default, nolog=False):
    instance = self.ec2.get_instance_by_id(instance_id)
    if instance is None or instance["State"]["Name"] != "running":
        return default

    if Cfg.get_int("ec2.az.evict_instances_when_az_faulty"):
        # If AWS indicates issues with some AZs, we assume instances located in them are 'unavail'
        if instance["Placement"]["AvailabilityZone"] in self.ec2.get_azs_with_issues():
            return "unavail"

    directives = Cfg.get("targetgroup.debug.inject_fault_status").split(",")
    for directive in directives:
        if directive == "":
            continue
        criteria, fault = directive.split(":")
        c = criteria.split("&")
        criteria = c[0]
        # Check if a targetgroup name constraint is set
        if len(c) > 1:
            if targetgroup is not None and self.get_short_targetgroup_name(targetgroup) not in c:
                continue
        instance_id = instance["InstanceId"]
        if criteria == instance_id or criteria == instance["Placement"]["AvailabilityZone"]:
            if not nolog:
                log.warning("Injecting targetgroup fault '%s/%s' for instance '%s'!" %
                            (targetgroup if targetgroup is not None else "All targetgroups",
                             fault, instance_id))
            return fault
    return default
class WusstestDuSchon(commands.Cog): def __init__(self, bot): self.bot = bot @routines.routine(minutes=config.get_int("WusstestDuSchonLoop")) async def loop(self): if await self.bot.stream(): channel = self.bot.channel() prefix = config.get_value("WusstestDuSchonPrefix") message = self.get_random_message(prefix) await self.bot.send_me(channel, message) @staticmethod def get_random_message(prefix): conn = sqlite3.connect("db.sqlite3") c = conn.cursor() c.execute( 'SELECT text, use_prefix from haugebot_web_wusstestduschon where active is true' ) wusstestduschon = random.choice(c.fetchall()) conn.close() if wusstestduschon[1] == 1: return prefix.strip() + " " + wusstestduschon[0].strip() else: return wusstestduschon[0] def change_interval(self, minutes): pass
def Evaluate(lSamples):
    bCollapseFirst = config.get_bool('COLLAPSE_FIRST')
    lFScores = []
    lPrecisions = []
    lRecalls = []
    dFalsePosCounts = collections.defaultdict(lambda: PredData(bPos=True))
    dFalseNegCounts = collections.defaultdict(lambda: PredData(bPos=False))
    dTruePosCounts = collections.defaultdict(lambda: PredData(bPos=True))
    dTotalCounts = collections.defaultdict(lambda: 0)
    for iIter in range(config.get_int('NUM_ITER')):
        lTrain, lTest = SplitTrainTest(lSamples)
        if config.get_bool('SVM'):
            assert not config.get_bool('LOG_LINEAR')
            lTest, dFeatureWeights = TrainAndTestSvm(lTrain, lTest)
        elif config.get_bool('LOG_LINEAR'):
            lTest, dFeatureWeights = log_linear.TrainAndTestFromGranular(lTrain, lTest)
        else:
            assert False
        if config.get_bool('WRITE_TRUE_POS_AND_FALSE_NEG'):
            UpdateBadPredCounts(dFalsePosCounts, dFalseNegCounts, dTruePosCounts,
                                dTotalCounts, dFeatureWeights, lTest)
        fScore, fPrec, fRecall = AnalyzePredsSimple(lTest)
        lFScores.append(fScore)
        lPrecisions.append(fPrec)
        lRecalls.append(fRecall)
    if config.get_bool('WRITE_TRUE_POS_AND_FALSE_NEG'):
        WriteBadPredCounts(dFalsePosCounts, dFalseNegCounts, dTruePosCounts, dTotalCounts)
    for fScore in lFScores:
        print "FScore is:", fScore
    print "Average Precision: ", np.average(lPrecisions), "\tStd: ", np.std(lPrecisions)
    print "Average Recall: ", np.average(lRecalls), "\tStd: ", np.std(lRecalls)
    print "Average F-Score: ", np.average(lFScores), "\tStd: ", np.std(lFScores)
def ack_event_dates(self, event_dates): client = self.context["dynamodb.client"] table_name = self.context["EventTable"] for date in event_dates: if Cfg.get_int("notify.event.keep_acked_records"): response = client.update_item( Key={"EventDate": { 'S': date }}, UpdateExpression="set AckDate=:ackdate", ExpressionAttributeValues={ ':ackdate': { 'S': str(self.context["now"]) } }, ConditionExpression="attribute_exists(EventDate)", ReturnConsumedCapacity='TOTAL', TableName=table_name, ) else: response = client.delete_item(Key={'EventDate': { 'S': date }}, TableName=table_name) log.debug(Dbg.pprint(response))
def GetPredPos(self, bIgnoreDir=False):
    if config.get_bool('SVM'):
        fThres = config.get_int('SVM_THRESHOLD')
    elif config.get_bool('LOG_LINEAR'):
        fThres = 0.5
    else:
        assert False
    return (self.GetPred(bIgnoreDir) > fThres)
def GenAllGranularSamplesFromList(lSentences, sLogFileName):
    sSentenceLogFile = config.get_string('SENTENCE_LOG_FILE')
    fLog = open(sSentenceLogFile, 'w')
    lSamples = []
    iNumLoopy = 0
    for sentence in lSentences:
        iCurNumLoopy, lCurSamples = sentence.GenAllGranularSamples(fLog)
        lSamples.extend(lCurSamples)
        iNumLoopy += iCurNumLoopy
    if iNumLoopy > 0:
        print "NUM LOOPY:", iNumLoopy
        assert iNumLoopy < config.get_int('NUM_ALLOWED_LOOPY'), \
            'Too Many Loopy: ' + str(iNumLoopy) + ' NonLoopy: ' + str(len(lSamples))
    sGoldDepFile = config.get_string('GOLD_DEP_FILE')
    if sGoldDepFile != '':
        dGoldDeps = data.file_to_obj_with_comments(sGoldDepFile)
        # add the gold dep info
        for sample in lSamples:
            if (sample.pddlconn.sPddlTo in dGoldDeps) and (sample.pddlconn.sPddlFrom in dGoldDeps[sample.pddlconn.sPddlTo]):
                sample.bGoldPos = True
    sPredDictFile = config.get_string('PRED_DICT_FILE')
    if sPredDictFile != '':
        lPredicates = predicate.PredDictFileToPredList(sPredDictFile)
        dObjToPredList = collections.defaultdict(lambda: [])
        for predCur in lPredicates:
            dObjToPredList[predCur.GetObject()].append(predCur)
        for sample in lSamples:
            sample.pddlconn.lFromPreds = dObjToPredList[sample.pddlconn.sPddlFrom]
            sample.pddlconn.lToPreds = dObjToPredList[sample.pddlconn.sPddlTo]
    else:
        assert False
    # prune the unnecessary features
    dFeatureCounts = collections.defaultdict(lambda: 0)
    for sample in lSamples:
        for iFeature in sample.features.GetFeatureIndexList():
            dFeatureCounts[iFeature] += 1
    iMinFeatureCount = config.get_int('MIN_FEATURE_OCCURANCE_COUNT')
    for sample in lSamples:
        for iFeature in sample.features.GetFeatureIndexList():
            if dFeatureCounts[iFeature] < iMinFeatureCount:
                sample.features.RemoveFeature(iFeature)
    return lSamples
def route_message(message, peer): if message[b"ttl"] > config.get_int("maxttl"): return np = tracerList.select_peer(message[b"to"]) if np: message[b"ttl"] += 1 np.sent += 1 np.send_packet(message)
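# A note on the b"..." keys: bencode.decode yields bytes keys, so the routing
# helpers index messages that way, while outgoing dicts (see tracer_task below)
# use str keys and serialize equivalently. A tiny self-contained sketch of the
# TTL guard; the maxttl value is an assumption standing in for
# config.get_int("maxttl"):
message = {b"type": b"tracer", b"from": b"node-a", b"to": b"SYSTEM", b"ttl": 0}

MAX_TTL = 16  # stand-in for config.get_int("maxttl")
if message[b"ttl"] <= MAX_TTL:
    message[b"ttl"] += 1  # hop count bumped before relaying, as in route_message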
def GetPredPos(self, bIgnoreDir=False):
    if bIgnoreDir:
        return self.bPredPos or self.pddlconn.pddlconnReverse.sample.bPredPos
    if config.get_bool('SVM'):
        fThres = config.get_int('SVM_THRESHOLD')
    elif config.get_bool('LOG_LINEAR'):
        fThres = 0.5
    else:
        assert False
    return (self.fPred > fThres)
def send_events(self, instance_ids, event_class, event_name, event_args, pretty_event_name=None, notification_handler=None): if not Cfg.get_int("ssm.enable"): return False now = self.context["now"] default_struct = { "EventName": None, "InstanceIdSuccesses": [], "InstanceIdsNotified": [] } event_desc = self.o_state.get_state_json(f"ssm.events.class.{event_class}", default=default_struct, TTL=self.ttl) if event_name != event_desc["EventName"]: event_desc["EventName"] = event_name event_desc["InstanceIdSuccesses"] = [] event_desc["InstanceIdsNotified"] = [] # Notify users if event_name is not None and notification_handler is not None: not_notified_instance_ids = [i for i in instance_ids if i not in event_desc["InstanceIdsNotified"]] if len(not_notified_instance_ids): R(None, notification_handler, InstanceIds=not_notified_instance_ids, EventClass=event_class, EventName=event_name, EventArgs=event_args) event_desc["InstanceIdsNotified"].extend(not_notified_instance_ids) # Send SSM events to instances if event_name is None: event_desc = default_struct elif Cfg.get_int("ssm.feature.events.ec2.maintenance_window_period"): ev_ids = [i for i in instance_ids if i not in event_desc["InstanceIdSuccesses"]] if len(ev_ids): log.log(log.NOTICE, f"Send event {event_class}: {event_name}({event_args}) to {ev_ids}") if pretty_event_name is None: pretty_event_name = "SendEvent" comment = f"CS-{pretty_event_name} (%s)" % self.context["GroupName"] r = self.run_command(ev_ids, event_name, args=event_args, comment=comment) for i in [i for i in ev_ids if i in r]: if r[i]["Status"] == "SUCCESS": # Keep track that we received a SUCCESS for this instance id to not resend it again later event_desc["InstanceIdSuccesses"].append(i) self.o_state.set_state_json(f"ssm.events.class.{event_class}", event_desc, TTL=self.ttl)
def is_instance_state(self, instance_id, state): i = next( filter(lambda i: i["InstanceId"] == instance_id, self.instance_statuses), None) if Cfg.get_int("ec2.az.evict_instances_when_az_faulty" ) and "az_evicted" in state: az = self.get_instance_by_id( instance_id)["Placement"]["AvailabilityZone"] if az in self.get_azs_with_issues(): return True return i["InstanceStatus"][ "Status"] in state if i is not None else False
def __init__(self, population: model.Population): self.pop = population # Accompanying chart of current cases and total deaths chart_width = config.get_int("Chart", "Width") chart_height = config.get_int("Chart", "Height") self.chart = bar_chart.Chart(chart_width, chart_height, config.get_int("Chart", "Cols"), v_min=0, v_max=config.get_int("Chart", "Max"), title="Current cases, cumulative deaths") # Move the chart out from under the main model view self.chart.win.master.geometry(f"{chart_width}x{chart_height}-5+0") # # Summary stats self.max_symptomatic = 0 self.max_period_dead = 0 self.prior_day_dead = 0 self.prior_period_dead = 0 self.max_symptomatic_day = 0 self.max_deaths_day = 0
def configure_dashboard(self): client = self.context["cloudwatch.client"] # Cloudwatch service is billing calls to dashboard API. We make sure that we do not call it too often now = self.context["now"] dashboard_state = Cfg.get_int("cloudwatch.dashboard.use_default") dashboard_last_state = self.ec2.get_state( "cloudwatch.dashboard.use_default.last_state") self.ec2.set_state("cloudwatch.dashboard.use_default.last_state", dashboard_state, TTL=Cfg.get_duration_secs("cloudwatch.default_ttl")) last_dashboad_action = self.ec2.get_state_date( "cloudwatch.dashboard.last_action", default=misc.epoch()) dashboad_update_interval = Cfg.get_duration_secs( "cloudwatch.dashboard.update_interval") if (str(dashboard_state) == dashboard_last_state ) and (now - last_dashboad_action ).total_seconds() < dashboad_update_interval: log.debug("Not yet the time to manage the dashboard.") return if Cfg.get_int("cloudwatch.dashboard.use_default") != 1: try: client.delete_dashboards( DashboardNames=[self._get_dashboard_name()]) except: pass else: content = self.load_dashboard() log.log( log.NOTICE, "Configuring CloudWatch dashboard '%s'..." % self._get_dashboard_name()) response = client.put_dashboard( DashboardName=self._get_dashboard_name(), DashboardBody=content) self.ec2.set_state("cloudwatch.dashboard.last_action", now, TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
def get_prerequisites(self): if "transfer" not in self.context["o_state"].get_resource_services(): return self.resources = self.o_state.get_resources(service="transfer") self.servers = [] transfer_client = self.context["transfer.client"] paginator = transfer_client.get_paginator('list_servers') tag_mappings = itertools.chain.from_iterable( page['Servers'] for page in paginator.paginate()) self.servers = list(tag_mappings) #self.state_table = self.o_state.get_state_table() #self.state_table.register_aggregates([ # { # "Prefix": "transferfamily.", # "Compress": True, # "DefaultTTL": Cfg.get_duration_secs("transferfamily.state.default_ttl"), # "Exclude" : [] # } # ]) metric_time_resolution = Cfg.get_int( "transferfamily.metrics.time_resolution") if metric_time_resolution < 60: metric_time_resolution = 1 # Switch to highest resolution self.cloudwatch.register_metric([ { "MetricName": "Subfleet.TransferFamily.Size", "Unit": "Count", "StorageResolution": metric_time_resolution }, { "MetricName": "Subfleet.TransferFamily.RunningServers", "Unit": "Count", "StorageResolution": metric_time_resolution }, ]) # We need to register dynamically subfleet configuration keys to avoid a 'key unknown' warning # when the user is going to set it subfleet_names = self.get_subfleet_names() for subfleet in subfleet_names: key = "subfleet.%s.state" % subfleet if not Cfg.is_builtin_key_exist(key): Cfg.register({key: ""}) log.log(log.NOTICE, "Detected TransferFamily subfleets '%s'." % subfleet_names)
def get_dashboard_images(self):
    dashboard = json.loads(self.load_dashboard())
    # Get graph properties
    graph_metrics = list(
        filter(lambda g: g["type"] == "metric", dashboard["widgets"]))
    properties = [g["properties"] for g in graph_metrics]
    client = self.context["cloudwatch.client"]
    r = {}
    for p in properties:
        title = p["title"]
        p["width"] = Cfg.get_int("cloudwatch.dashboard.snapshot_width")
        p["height"] = Cfg.get_int("cloudwatch.dashboard.snapshot_height")
        try:
            response = client.get_metric_widget_image(
                MetricWidget=json.dumps(p))
            r[title] = str(base64.b64encode(response["MetricWidgetImage"]),
                           "utf-8")
        except Exception as e:
            # Original format string had '% e' embedded in the literal, which
            # made the '%' operator fail; '%s' is what was intended.
            log.exception(
                "Failed to retrieve CloudWatch graph image for '%s'! : %s" %
                (title, e))
    return r
def read_all_sqs_messages(): messages = [] sqs_client = ctx["sqs.client"] while True: response = sqs_client.receive_message( QueueUrl=ctx["MainSQSQueue"], AttributeNames=['All'], MaxNumberOfMessages=10, VisibilityTimeout=Cfg.get_int("app.run_period"), WaitTimeSeconds=0) if "Messages" in response: messages.extend(response["Messages"]) else: break return messages
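# Usage sketch (boto3 semantics; process() is a hypothetical handler):
# read_all_sqs_messages drains the queue but does not delete, so received
# messages reappear after the visibility timeout unless deleted explicitly.
messages = read_all_sqs_messages()
for m in messages:
    process(m)  # hypothetical processing step
    ctx["sqs.client"].delete_message(
        QueueUrl=ctx["MainSQSQueue"],
        ReceiptHandle=m["ReceiptHandle"])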
def route_random(message, peer): if len(peers.peers) < 2: return if message[b"ttl"] > config.get_int("maxttl"): return np = peer while np == peer: np = random.choice(peers.peers) message[b"ttl"] += 1 np.send_packet(message)
def tracer_task(): while True: while len(peers.peers) < 1: time.sleep(5) peer = random.choice(peers.peers) peer.sent += 1 msg = { "type": "tracer", "from": config.get("id"), "to": "SYSTEM", "ttl": 0, "ts": seconds_ns(), } peer.send_packet(msg) time.sleep(config.get_int("tracer_interval"))
def manage_rule_event(self, event):
    if Cfg.get_int("cron.disable"):
        return
    if "source" in event and event["source"] == "aws.events" and event["detail-type"] == "Scheduled Event":
        # Triggered by an AWS CloudWatch Scheduled event. We look for a ParameterSet
        # request based on the ARN
        misc.initialize_clients(["events"], self.context)
        misc.load_prerequisites(self.context, ["o_scheduler"])
        for r in event["resources"]:
            log.debug("Processing Scheduled event '%s'..." % r)
            m = re.search(
                "^arn:aws:events:[a-z-0-9]+:[0-9]+:rule/CS-Cron-%s-(.*)" %
                self.context["GroupName"], r)
            if m is not None and len(m.groups()) == 1:
                rule_num = m.group(1)
                log.info("Got event rule '%s'" % rule_num)
                self.load_event_definitions()
                rule_def = self.get_ruledef_by_name(
                    "CS-Cron-%s-%s" % (self.context["GroupName"], rule_num))
                log.debug(rule_def)

                ttl = None
                try:
                    ttl = misc.str2duration_seconds(
                        rule_def["TTL"]
                    ) if rule_def is not None and "TTL" in rule_def else None
                except Exception as e:
                    # The original referenced an undefined name 'TTL' here,
                    # which would raise NameError inside the handler.
                    log.exception(
                        "[WARNING] Failed to read 'TTL' value '%s'!" %
                        (rule_def["TTL"]))

                params = dict(rule_def["Data"][0])
                for k in params:
                    if k in ["TTL", "schedule"]:
                        continue
                    Cfg.set(k, params[k], ttl=ttl)
                return True
    return False
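# The ARN match above only fires for rules carrying the CS-Cron-<GroupName>-
# prefix. For illustration, with a made-up ARN and group name:
import re

group_name = "mygroup"  # hypothetical GroupName
arn = "arn:aws:events:eu-west-1:123456789012:rule/CS-Cron-mygroup-rule7"
m = re.search("^arn:aws:events:[a-z-0-9]+:[0-9]+:rule/CS-Cron-%s-(.*)" % group_name, arn)
print(m.group(1))  # -> rule7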
def prepare_ssm(self): if not Cfg.get_int("ssm.enable"): return now = self.context["now"] client = self.context["ssm.client"] # Update instance inventory log.debug("describe_instance_information()") paginator = client.get_paginator('describe_instance_information') response_iterator = paginator.paginate( Filters=[ { 'Key': 'tag:clonesquad:group-name', 'Values': [self.context["GroupName"]] }, ], MaxResults=50) instance_infos = [] for r in response_iterator: instance_infos.extend([d for d in r["InstanceInformationList"]]) self.instance_infos = instance_infos log.debug("end - describe_instance_information()")
def listener(): addr = config.get("addr") port = config.get_int("port", 0) sock.bind((addr, port)) while True: try: # Clean temporary peers peers.clean_temp() packet, peer_addr = sock.recvfrom(2048) message = bencode.decode(packet) if config.get_bool("dumpraw"): print(packet) key = config.get("network-key") if key: assert NetworkKey.check_signature(message) peer = peers.find_by_addr(peer_addr) if not peer: if config.get_bool("temp_peer"): peer = peers.create_temp(peer_addr) else: continue if config.get_bool("dump"): print(message) #print(message, peer_addr, peer.alias) handle(message, peer) peer.last_received = time.time() except Exception as e: pass
def interact_handler_entrypoint(event, context): """ Parameters ---------- event: dict, required context: object, required Lambda Context runtime methods and attributes Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html Returns ------ """ global ctx ctx["now"] = misc.utc_now() ctx["FunctionName"] = "Interact" log.info("Processing start (version=%s)" % (ctx.get("CloneSquadVersion"))) init() notify.do_not_notify = True # We do not want notification and event management in the Interact function #log.info(json.dumps(event)) if ctx["LoggingS3Path"] != "" and Cfg.get_int("app.archive_interact_events"): s3path = "%s/InteractEvents/%s.json" % (ctx["LoggingS3Path"], ctx["now"]) log.warning("Pushing Interact event in '%s'!" % s3path) misc.put_s3_object(s3path, Dbg.pprint(event)) response = {} if ctx["o_interact"].handler(event, context, response): log.debug("API Gateway response: %s" % response) sqs.process_sqs_records(ctx, event) return response
def publish_all_reports(ctx, url, reportname, now=None): global recurse_protection if recurse_protection: return recurse_protection = True try: log.info("Generating debug report in memory...") report = debug_report_generator(ctx) log.info("Publishing to S3 (clear text)...") debug_report_publish(ctx, report, url, reportname=reportname, now=now) log.info("Obfuscating...") if Cfg.get_int("notify.debug.obfuscate_s3_reports"): report = debug_report_obfuscate(ctx, report) log.info("Publishing to S3 (obfuscated)...") debug_report_publish(ctx, report, url, reportname="%s_OBFUSCATED" % reportname, now=now) except Exception as e: log.exception("[ERROR] Failed to send debug report to S3 (%s) : %s" % (url, e)) log.info("Uploaded debug report to S3...") recurse_protection = False
def get_prerequisites(self):
    if Cfg.get_int("cron.disable"):
        return

    # Get Timezone related info
    self.timezones = yaml.safe_load(
        misc.get_url("internal:region-timezones.yaml"))
    self.tz = os.getenv("TimeZone")
    self.tz = self.timezones.get(self.context["AWS_DEFAULT_REGION"]) if (
        self.tz is None or self.tz == "") else self.tz
    self.tz = self.tz if self.tz else "UTC"
    self.local_now = arrow.now(
        self.tz)  # Get local time (with local timezone)
    self.utc_offset = self.local_now.utcoffset()
    self.dst_offset = self.local_now.dst()
    log.log(
        log.NOTICE,
        "Current timezone offset to UTC: %s, DST: %s, TimeZone: %s" %
        (self.utc_offset, self.dst_offset, self.tz))

    # Load scheduler KV table
    self.scheduler_table = kvtable.KVTable.create(
        self.context,
        self.context["SchedulerTable"],
        cache_max_age=Cfg.get_duration_secs("scheduler.cache.max_age"))

    # Compute event names
    self.load_event_definitions()

    # Read all existing event rules
    client = self.context["events.client"]
    params = {
        "NamePrefix": "CS-Cron-%s-" % (self.context["GroupName"]),
        "Limit": 10
    }
    self.rules = []
    paginator = client.get_paginator('list_rules')
    response_iterator = paginator.paginate(**params)
    for response in response_iterator:
        if "Rules" in response:
            self.rules.extend(response["Rules"])

    max_rules_per_batch = Cfg.get_int("cron.max_rules_per_batch")

    # Create missing rules
    expected_rule_names = [r["Name"] for r in self.event_names]
    existing_rule_names = [r["Name"] for r in self.rules]
    for r in expected_rule_names:
        if r not in existing_rule_names:
            max_rules_per_batch -= 1
            if max_rules_per_batch <= 0:
                break
            rule_def = self.get_ruledef_by_name(r)
            schedule_spec = rule_def["Data"][0]["schedule"]
            schedule_expression = self.process_cron_expression(schedule_spec)
            log.log(
                log.NOTICE,
                f"Creating {r} {schedule_spec} => {schedule_expression}...")
            # In order to remove burden on the user, we perform a sanity check about a well-known
            # limitation of CloudWatch.
            if schedule_expression.startswith("cron("):
                expr = [
                    i for i in schedule_expression.replace("(", " ").replace(
                        ")", " ").split(" ") if i != ""
                ]
                if len(expr) != 7:
                    log.warn(
                        "Schedule rule '%s' has an invalid cron expression '%s' (too short cron syntax)! Ignore it..."
                        % (rule_def["EventName"], schedule_expression))
                    continue
                if (expr[5] != '?' and not expr[3] == '?') or (
                        expr[3] != '?' and not expr[5] == '?'):
                    log.warn(
                        "Schedule rule '%s' has an invalid cron expression '%s'. "
                        "You can't specify the Day-of-month and Day-of-week fields in the same cron expression. "
                        "If you specify a value (or a *) in one of the fields, you must use a ? (question mark) in the other."
                        % (rule_def["EventName"], schedule_expression))
                    continue

            # Update CloudWatch rule
            try:
                response = client.put_rule(
                    Name=r,
                    Description="Schedule Event '%s': %s" %
                    (rule_def["EventName"], rule_def["Event"]),
                    RoleArn=self.context["CloudWatchEventRoleArn"],
                    ScheduleExpression=schedule_expression,
                    State='ENABLED')
                log.debug("put_rule: %s" % response)
            except Exception as e:
                log.exception(
                    "Failed to create scheduler event '%s' (%s) : %s" %
                    (r, schedule_expression, e))

            try:
                response = client.put_targets(
                    Rule=r,
                    Targets=[{
                        'Arn': self.context["InteractLambdaArn"],
                        'Id': "id%s" % r,
                    }])
                log.debug("put_targets: %s" % response)
            except Exception as e:
                log.exception(
                    "Failed to set targets for event rule '%s' : %s" % (r, e))

    # Garbage collect obsolete rules
    for r in existing_rule_names:
        if r not in expected_rule_names:
            max_rules_per_batch -= 1
            if max_rules_per_batch <= 0:
                break
            try:
                client.remove_targets(Rule=r, Ids=["id%s" % r])
                client.delete_rule(Name=r)
            except Exception as e:
                log.exception("Failed to delete rule '%s' : %s" % (r, e))
def AnalyzePredsSimple(lSamples):
    if config.get_bool('FORCE_SINGLE_DIR'):
        dSamples = {}
        for sample in lSamples:
            tKey = (sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo)
            assert (tKey not in dSamples)
            dSamples[tKey] = sample
    iNumTotal = 0
    iNumCorrect = 0
    iTruePos = 0
    iFalsePos = 0
    iTrueNeg = 0
    iFalseNeg = 0
    iThres = 0
    if config.get_bool('SVM'):
        fThres = config.get_int('SVM_THRESHOLD')
    elif config.get_bool('LOG_LINEAR'):
        fThres = 0.5
    else:
        assert False
    if config.get_bool('CALC_FSCORE_ON_GOLD'):
        setGoldStringConns = LoadGoldStringConnSet()
        iNumGold = len(setGoldStringConns)
    if config.get_bool('ANALYZE_ON_HARD'):
        lEasy = data.file_to_obj(config.get_string('EASY_CONNECTIONS_LIST_FILE'))
    fPredMin = sys.float_info.max
    fPredMax = -sys.float_info.max
    for sample in lSamples:
        if config.get_bool('ANALYZE_ON_HARD'):
            if sample.pddlconn.sPddlTo in lEasy:
                continue
        if config.get_bool('TRAIN_ON_REWARD_EVAL_ON_GOLD'):
            bActual = sample.GetGoldPos(bIgnoreDir=config.get_bool('IGNORE_DIR_FOR_EVAL'))
        else:
            bActual = sample.GetPos(bIgnoreDir=config.get_bool('IGNORE_DIR_FOR_EVAL'))
        if config.get_bool('FORCE_SINGLE_DIR'):
            fPred = sample.fPred
            tReverseKey = (sample.pddlconn.sPddlTo, sample.pddlconn.sPddlFrom)
            fReversePred = dSamples[tReverseKey].fPred if tReverseKey in dSamples else -sys.maxint
            bNormalPred = (float(sample.fPred) > fThres)
            bPred = ((float(sample.fPred) > fThres) and (float(fPred) >= float(fReversePred)))
            if tReverseKey not in dSamples:
                print "FORCE-MISSING"
            elif (bNormalPred == bActual) and (bPred != bActual):
                print "FORCE-BAD:", sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo, fPred, fReversePred
            elif (bNormalPred != bActual) and (bPred == bActual):
                print "FORCE-GOOD:", sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo, fPred, fReversePred
            else:
                print "FORCE-NEITHER:", sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo, fPred, fReversePred
        else:
            bPred = sample.GetPredPos(bIgnoreDir=config.get_bool('IGNORE_DIR_FOR_EVAL'))
        fPredMin = min(fPredMin, sample.fPred)
        fPredMax = max(fPredMax, sample.fPred)
        iNumTotal += 1
        if bPred == bActual:
            iNumCorrect += 1
        if bPred:
            if bActual:
                iTruePos += 1
            else:
                iFalsePos += 1
        else:
            if bActual:
                iFalseNeg += 1
            else:
                iTrueNeg += 1
    if config.get_bool('CALC_FSCORE_ON_GOLD'):
        iFalseNeg = iNumGold - iTruePos
        if config.get_bool('ANALYZE_ON_HARD'):
            iFalseNeg = iNumGold - iTruePos - len(lEasy)
    fPrecision = float(iTruePos) / float(iTruePos + iFalsePos) if iTruePos > 0 else 0
    fRecall = float(iTruePos) / float(iTruePos + iFalseNeg) if iTruePos > 0 else 0
    fScore = 2 * fPrecision * fRecall / (fPrecision + fRecall) if (fPrecision * fRecall) > 0 else 0
    print "FPred: min:", fPredMin, "max:", fPredMax
    print "FScore:", fScore, fPrecision, fRecall
    print "Frac Correct:", float(iNumCorrect) / float(iNumTotal), iNumCorrect, iNumTotal
    print "TP:", iTruePos, "FP:", iFalsePos, "TN:", iTrueNeg, "FN:", iFalseNeg
    print "FracPos:", float(iTruePos + iFalsePos) / float(iTrueNeg + iFalseNeg + iTruePos + iFalsePos)
    return fScore, fPrecision, fRecall
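# The tail of AnalyzePredsSimple is the standard precision/recall/F1
# computation. A quick numeric sanity check with made-up counts:
iTruePos, iFalsePos, iFalseNeg = 8, 2, 4
fPrecision = float(iTruePos) / float(iTruePos + iFalsePos)   # 0.8
fRecall = float(iTruePos) / float(iTruePos + iFalseNeg)      # 0.666...
fScore = 2 * fPrecision * fRecall / (fPrecision + fRecall)   # harmonic mean, ~0.727
print(fScore)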
def configure_alarms(self):
    """ Configure CloudWatch Alarms for each instance.

    The algorithm needs to manage missing alarms as well as update existing ones.
    """
    now = self.context["now"]
    client = self.context["cloudwatch.client"]
    valid_alarms = []
    nb_of_updated_alarms = 0
    max_update_per_batch = Cfg.get_int("cloudwatch.metrics.max_update_per_batch")

    log.log(log.NOTICE,
            "Found following Alarm definition key(s) in configuration: %s" %
            [d for d in self.alarm_definitions])

    # Step 1) Create or Update CloudWatch Alarms for running instances
    for instance in self.ec2.get_instances(State="pending,running",
                                           ScalingState="-error,draining,excluded"):
        instance_id = instance["InstanceId"]
        age_secs = (now - instance["LaunchTime"]).total_seconds()
        min_instance_age = Cfg.get_duration_secs("cloudwatch.alarms.min_instance_age")
        if age_secs < min_instance_age:
            log.log(log.NOTICE,
                    "Instance '%s' too young. Wait %d seconds before setting an alarm..." %
                    (instance_id, min_instance_age - age_secs))
            continue

        # Update alarms for this instance
        for alarm_definition in self.alarm_definitions:
            # First, check if an alarm already exists
            alarm_name = self._get_alarm_name(self.context["GroupName"],
                                              instance["InstanceId"], int(alarm_definition))
            existing_alarms = list(filter(lambda x: x['AlarmName'] == alarm_name, self.alarms))

            # Load alarm template
            try:
                if "Content" not in self.alarm_definitions[alarm_definition]:
                    continue
                kwargs = self.context.copy()
                kwargs["InstanceId"] = instance_id
                alarm_template = self.alarm_definitions[alarm_definition]["Content"].format(**kwargs)
                alarm = yaml.safe_load(alarm_template)
            except Exception as e:
                log.exception("[ERROR] Failed to read YAML alarm file '%s' : %s" %
                              (alarm_template, e))
                continue
            alarm["AlarmName"] = alarm_name
            valid_alarms.append(alarm_name)

            # Check if an alarm already exists
            existing_alarm = None
            if len(existing_alarms) > 0:
                existing_alarm = existing_alarms[0]

                # Check if alarm definition will be the same
                a = {**existing_alarm, **alarm}
                # 2020/07/20: CloudWatch Alarm API does not return Tags. Have to deal with
                # while comparing the configurations.
                if "Tags" in a and "Tags" not in existing_alarm:
                    del a["Tags"]
                if a == existing_alarm:
                    #log.debug("Not updating alarm '%s' as configuration is already ok" % alarm_name)
                    continue

                # Check if we updated this alarm very recently
                delta = datetime.now(timezone.utc) - existing_alarm["AlarmConfigurationUpdatedTimestamp"]
                if delta < timedelta(minutes=1):
                    log.debug("Alarm '%s' updated too soon" % alarm_name)
                    continue

            nb_of_updated_alarms += 1
            if nb_of_updated_alarms > max_update_per_batch:
                break
            log.log(log.NOTICE,
                    "Updating/creating CloudWatch Alarm '%s' : %s" % (alarm_name, alarm))
            resp = client.put_metric_alarm(**alarm)
            log.debug(Dbg.pprint(resp))

    # Step 2) Destroy CloudWatch Alarms for non existing instances (Garbage Collection)
    for existing_alarm in self.alarms:
        alarm_name = existing_alarm["AlarmName"]
        if not alarm_name.startswith("CloneSquad-%s-i-" % (self.context["GroupName"])):
            continue
        if alarm_name not in valid_alarms:
            nb_of_updated_alarms += 1
            if nb_of_updated_alarms > max_update_per_batch:
                break
            log.debug("Garbage collecting orphan CloudWatch Alarm '%s'" % alarm_name)
            resp = client.delete_alarms(AlarmNames=[alarm_name])
            log.debug(resp)
            nb_of_updated_alarms += 1
            if nb_of_updated_alarms > max_update_per_batch:
                break
def __init__(self, context, ec2):
    self.context = context
    self.ec2 = ec2
    self.alarms = None
    self.metrics = []
    Cfg.register({
        "cloudwatch.describe_alarms.max_results": "50",
        "cloudwatch.default_ttl": "days=1",
        "cloudwatch.alarms.max_per_instance": "6",
        "cloudwatch.alarms.min_instance_age": "minutes=3",
        "cloudwatch.configure.max_alarms_deleted_batch_size": "5",
        "cloudwatch.metrics.namespace": "CloneSquad",
        "cloudwatch.metrics.subnamespace": "",
        "cloudwatch.metrics.excluded,Stable": {
            "DefaultValue": "",
            "Format": "StringList",
            "Description": """List of metric pattern names to not send to Cloudwatch

This configuration key is used to do cost optimization by filtering which CloneSquad Metrics
are sent to Cloudwatch. It supports regex patterns.

> Ex: StaticFleet.*;NbOfBouncedInstances
"""
        },
        "cloudwatch.metrics.data_period": "minutes=2",
        "cloudwatch.metrics.max_update_per_batch": "20",
        "cloudwatch.metrics.cache.max_retention_period": "minutes=10",
        "cloudwatch.metrics.minimum_polled_alarms_per_run": "1",
        "cloudwatch.metrics.time_for_full_metric_refresh,Stable": {
            "DefaultValue": "minutes=1,seconds=30",
            "Format": "Duration",
            "Description": """The total period for a complete refresh of EC2 Instance metrics

This parameter is a way to reduce Cloudwatch cost induced by GetMetricData API calls.
It defines indirectly how many alarm metrics will be polled in a single Main Lambda execution.
A dedicated algorithm is used to extrapolate missing data based on previous GetMetricData API calls.

Reducing this value increases the accuracy of the scaling criteria and so the reactivity of
CloneSquad to a sudden burst of activity load, but at the expense of Cloudwatch.GetMetricData
API cost.

This parameter does not influence the polling of user supplied alarms that are always polled
at each run.
"""
        },
        "cloudwatch.dashboard.use_default,Stable": {
            "DefaultValue": 1,
            "Format": "Bool",
            "Description": """Enable or disable the Cloudwatch dashboard for CloneSquad.

The dashboard is enabled by default.
"""
        },
        "cloudwatch.dashboard.update_interval": "hours=1",
        "cloudwatch.dashboard.snapshot_width": 1000,
        "cloudwatch.dashboard.snapshot_height": 400
    })
    Cfg.register({
        "cloudwatch.alarm00.configuration_url,Stable": {
            "DefaultValue": "",
            "Format": "MetaString",
            "Description": """Alarm specification to track for scaling decisions.

    Ex: internal:ec2.scaleup.alarm-cpu-gt-75pc.yaml,Points=1001,BaselineThreshold=30.0

See [Alarm specification documentation](ALARMS_REFERENCE.md) for more details.
"""
        }
    })
    for i in range(1, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        Cfg.register({
            "cloudwatch.alarm%02d.configuration_url,Stable" % i: {
                "DefaultValue": "",
                "Format": "MetaString",
                "Description": """See `cloudwatch.alarm00.configuration_url`.
"""
            }
        })
    self.register_metric([{
        "MetricName": "Cloudwatch.GetMetricData",
        "Unit": "Count",
        "StorageResolution": 60
    }])
    self.ec2.register_state_aggregates([{
        "Prefix": "cloudwatch.dashboard.",
        "Compress": True,
        "DefaultTTL": Cfg.get_duration_secs("cloudwatch.default_ttl"),
        "Exclude": []
    }])
def get_prerequisites(self):
    now = self.context["now"]
    client = self.context["cloudwatch.client"]

    # Read all CloudWatch alarm templates into memory
    alarm_definitions = {}
    for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
        key = "cloudwatch.alarm%02d.configuration_url" % (i)
        r = Cfg.get_extended(key)
        if not r["Success"] or r["Value"] == "":
            continue
        d = misc.parse_line_as_list_of_dict(r["Value"])
        url = d[0]["_"]
        meta = d[0]
        index = "%02d" % i
        alarm_defs = {
            "Index": index,
            "Key": key,
            "Url": url,
            "Definition": r,
            "Metadata": meta
        }
        prefix = "alarmname:"
        if url.startswith(prefix):
            alarm_defs["AlarmName"] = url[len(prefix):]
        else:
            log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
            try:
                resp = misc.get_url(url.format(**self.context))
                if resp is None:
                    raise Exception("URL content = <None>")
                alarm_defs["Content"] = str(resp, "utf-8")
            except Exception as e:
                # '%e' in the original was a float conversion; '%s' is intended.
                log.exception("Failed to load Alarm definition '%s' : %s" %
                              (r["Value"], e))
                continue
        alarm_definitions[index] = alarm_defs
    self.alarm_definitions = alarm_definitions

    # Read all existing CloudWatch alarms
    alarms = []
    response = None
    while (response is None or "NextToken" in response):
        response = client.describe_alarms(
            MaxRecords=Cfg.get_int("cloudwatch.describe_alarms.max_results"),
            NextToken=response["NextToken"] if response is not None else "")
        #log.debug(Dbg.pprint(response))
        for alarm in response["MetricAlarms"]:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is not None:
                # This is an alarm that belongs to this CloneSquad instance
                alarms.append(alarm)
    #log.debug(Dbg.pprint(alarms))
    self.alarms = alarms

    # Sanity check
    for index in self.alarm_definitions.keys():
        alarm_def = self.alarm_definitions[index]
        if "AlarmName" not in alarm_def:
            continue
        alarm = next(
            filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"], self.alarms),
            None)
        if alarm is None:
            log.warning(
                "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!" %
                (alarm_def["Definition"]["Key"], alarm_def["Definition"]["Value"],
                 alarm_def["Definition"]["Status"]))

    # Read all metrics associated with alarms
    #
    # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
    # scale rate => Under intense scale up condition, polling is aggressive. If not, it falls down
    # to one polling every 'cloudwatch.metrics.low_rate_polling_interval' seconds
    # TODO(@jcjorel): Avoid this kind of direct references to an upper level module!!
    integration_period = Cfg.get_duration_secs("ec2.schedule.horizontalscale.integration_period")
    instance_scale_score = self.ec2.get_integrated_float_state(
        "ec2.schedule.scaleout.instance_scale_score", integration_period)

    self.metric_cache = self.get_metric_cache()

    query = {"IdMapping": {}, "Queries": []}

    # Build query for Alarm metrics
    if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
        # Sort by oldest alarms first in cache
        cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
        valid_alarms = []
        for a in alarms:
            alarm_name = a["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is None or alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
                continue
            a["_SamplingTime"] = self.get_metric_by_id(alarm_name)["_SamplingTime"] \
                if alarm_name in cached_metric_names else str(misc.epoch())
            valid_alarms.append(a)
        sorted_alarms = sorted(valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

        # We poll from the oldest to the newest, scaled by instance_scale_score, to limit
        # CloudWatch GetMetricData costs
        time_for_full_metric_refresh = max(
            Cfg.get_duration_secs("cloudwatch.metrics.time_for_full_metric_refresh"), 1)
        app_run_period = Cfg.get_duration_secs("app.run_period")
        minimum_polled_alarms_per_run = Cfg.get_int("cloudwatch.metrics.minimum_polled_alarms_per_run")
        maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
        maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run, 1.0)
        weight = min(instance_scale_score, maximum_polled_alarms_per_run)
        max_alarms_for_this_run = max(minimum_polled_alarms_per_run,
                                      int(min(weight, 1.0) * len(sorted_alarms)))
        for alarm in sorted_alarms[:max_alarms_for_this_run]:
            alarm_name = alarm["AlarmName"]
            CloudWatch._format_query(query, alarm_name, alarm)

        # We always poll user supplied alarms
        for alarm in alarms:
            alarm_name = alarm["AlarmName"]
            alarm_def = self.get_alarm_configuration_by_name(alarm_name)
            if alarm_def is None:
                continue  # Unknown alarm name
            if not alarm_def["AlarmDefinition"]["Url"].startswith("alarmname:"):
                continue
            CloudWatch._format_query(query, alarm_name, alarm)

    # Query Metric for Burstable instances
    burstable_instances = self.ec2.get_burstable_instances(ScalingState="-error")
    last_collect_date = self.ec2.get_state_date("cloudwatch.metrics.last_burstable_metric_collect_date")
    if last_collect_date is None or (now - last_collect_date) > timedelta(minutes=1):
        for i in burstable_instances:
            instance_id = i["InstanceId"]
            if not self.ec2.is_static_subfleet_instance(instance_id) and \
               self.ec2.get_scaling_state(instance_id) == "excluded":
                continue
            CloudWatch._format_query(
                query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                    "MetricName": "CPUCreditBalance",
                    "Namespace": "AWS/EC2",
                    "Dimensions": [{
                        "Name": "InstanceId",
                        "Value": instance_id
                    }],
                    "Period": 300,
                    "Statistic": "Average"
                })
        self.ec2.set_state("cloudwatch.metrics.last_burstable_metric_collect_date",
                           now, TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

    # Make request to CloudWatch
    query_counter = self.ec2.get_state_int("cloudwatch.metric.query_counter", default=0)
    queries = query["Queries"]
    metric_results = []
    metric_ids = []
    no_metric_ids = []
    while len(queries) > 0:
        q = queries[:500]
        queries = queries[500:]
        results = []
        response = None
        while response is None or "NextToken" in response:
            args = {
                "MetricDataQueries": q,
                "StartTime": now - timedelta(seconds=Cfg.get_duration_secs(
                    "cloudwatch.metrics.data_period")),
                "EndTime": now
            }
            if response is not None:
                args["NextToken"] = response["NextToken"]
            response = client.get_metric_data(**args)
            results.extend(response["MetricDataResults"])
        query_counter += len(q)

        for r in results:
            if r["StatusCode"] != "Complete":
                log.error("Failed to retrieve metrics: %s" % q)
                continue
            metric_id = query["IdMapping"][r["Id"]]
            if len(r["Timestamps"]) == 0:
                if metric_id not in no_metric_ids:
                    no_metric_ids.append(metric_id)
                continue
            if metric_id not in metric_ids:
                metric_ids.append(metric_id)
            r["_MetricId"] = metric_id
            r["_SamplingTime"] = str(now)
            log.debug(r)
            metric_results.append(r)
    if len(no_metric_ids):
        log.info("No metrics returned for alarm '%s'" % no_metric_ids)

    # Merge with existing cache metric
    metric_cache = self.metric_cache
    self.metric_cache = metric_results
    for m in metric_cache:
        max_retention_period = Cfg.get_duration_secs("cloudwatch.metrics.cache.max_retention_period")
        if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
            continue
        if (now - misc.str2utc(m["_SamplingTime"])).total_seconds() < max_retention_period:
            self.metric_cache.append(m)
    self.ec2.set_state("cloudwatch.metric.query_counter", query_counter,
                       TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.ec2.set_state_json("cloudwatch.metrics.cache", self.metric_cache,
                            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
    self.set_metric("Cloudwatch.GetMetricData", query_counter)

    # Augment Alarm definitions and Instances with associated metrics
    for metric in self.metric_cache:
        metric_id = metric["_MetricId"]
        alarm_data = self.get_alarm_data_by_name(metric_id)
        if alarm_data is not None:
            alarm_data["MetricDetails"] = metric
            continue
        instance = next(
            filter(lambda i: "CPUCreditBalance/%s" % i["InstanceId"] == metric_id,
                   burstable_instances), None)
        if instance is not None:
            instance["_Metrics"] = {}
            instance["_Metrics"]["CPUCreditBalance"] = metric
            continue
def send_commands(self): if not Cfg.get_int("ssm.enable"): return client = self.context["ssm.client"] refs = { "Linux": { "document": "AWS-RunShellScript", "shell": [s.rstrip() for s in io.StringIO(str(misc.get_url("internal:cs-ssm-agent.sh"), "utf-8")).readlines()], "ids": [], } } # Purge already replied results valid_cmds = [] for cmd in self.run_cmd_states["Commands"]: if cmd.get("Complete") or cmd["Expiration"] < misc.seconds_from_epoch_utc(): continue valid_cmds.append(cmd) self.run_cmd_states["Commands"] = valid_cmds # Purge outdated former results former_results = self.run_cmd_states["FormerResults"] for i in list(former_results.keys()): for cmd in list(former_results[i].keys()): if former_results[i][cmd]["Expiration"] < misc.seconds_from_epoch_utc(): del former_results[i][cmd] if len(former_results[i].keys()) == 0: del former_results[i] # Send commands for cmd in self.commands_to_send: platforms = {} for i in cmd["InstanceIds"]: info = self.is_instance_online(i) if info is None: continue platform_type = info["PlatformType"] pltf = refs.get(platform_type) if pltf is None: log.warning("Can't run a command on an unsupported platform : %s" % info["PlatformType"]) continue # Unsupported platform if platform_type not in platforms: platforms[platform_type] = copy.deepcopy(pltf) if i not in platforms[platform_type]["ids"]: platforms[platform_type]["ids"].append(i) command = cmd["Command"] args = cmd["CommandArgs"] for p in platforms: pltf = platforms[p] instance_ids = pltf["ids"] if not len(instance_ids): continue document = pltf["document"] shell = pltf["shell"] i_ids = instance_ids # Perform string parameter substitutions in the helper script shell_input = [l.replace("##Cmd##", command) for l in shell] shell_input = [l.replace("##ApiGwUrl##", self.context["InteractAPIGWUrl"]) for l in shell_input] if isinstance(args, str): shell_input = [l.replace("##Args##", args) for l in shell_input] else: shell_input = [l.replace("##Args##", args["Args"] if "Args" in args else "") for l in shell_input] for s in args: shell_input = [l.replace(f"##{s}##", str(args[s])) for l in shell_input] while len(i_ids): log.log(log.NOTICE, f"SSM SendCommand({p}): {command}({args}) to %s." % i_ids[:50]) try: response = client.send_command( InstanceIds=i_ids[:50], DocumentName=document, TimeoutSeconds=cmd["Timeout"], Comment=cmd["Comment"], Parameters={ 'commands': shell_input, 'executionTimeout': [str(cmd["Timeout"])] }, MaxConcurrency='100%', MaxErrors='100%', CloudWatchOutputConfig={ 'CloudWatchLogGroupName': self.context["SSMLogGroup"], 'CloudWatchOutputEnabled': True } ) self.run_cmd_states["Commands"].append({ "Id": response["Command"]["CommandId"], "InstanceIds": i_ids[:50], "ReceivedInstanceIds": [], "Command": command, "CommandArgs": args, "Results": {}, "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.default_ttl") }) log.log(log.NOTICE, f"SSM RunCommand (Id:%s) : {command}({args})" % response["Command"]["CommandId"]) except Exception as e: # Under rare circumstance, we can receive an Exception while trying to send log.log(log.NOTICE, f"Failed to do SSM SendCommand : {e}, %s" % i_ids[:50]) i_ids = i_ids[50:] self.o_state.set_state_json("ssm.events.run_commands", self.run_cmd_states, compress=True, TTL=self.ttl)
def manage_maintenance_windows(self):
    """ Read SSM Maintenance Window information and apply temporary configuration during maintenance period.
    """
    config_tag = "clonesquad:config:"

    def _set_tag(fleet, config, mw):
        min_instance_count = None
        if "Tags" in mw:
            tags = {}
            for t in mw["Tags"]:
                if t["Key"].startswith(config_tag):
                    tags[t["Key"][len(config_tag):]] = t["Value"]
            if fleet is None:
                if "ec2.schedule.min_instance_count" in tags:
                    min_instance_count = tags["ec2.schedule.min_instance_count"]
            else:
                tag = f"subfleet.{fleet}.ec2.schedule.min_instance_count"
                if tag in tags:
                    min_instance_count = tags[tag]
                    del tags[tag]
                tag = f"subfleet.__all__.ec2.schedule.min_instance_count"
                if tag in tags:
                    min_instance_count = tags[tag]
                    del tags[tag]
            for t in tags:
                if not Cfg.is_builtin_key_exist(t):
                    log.warning(f"On SSM MaintenanceWindow object %s/%s, tag '{config_tag}.{t}' does not refer "
                                "to an existing configuration key!!" % (mw["WindowId"], mw["Name"]))
                    continue
                config[f"override:{t}"] = tags[t]
        return min_instance_count

    config = {}
    meta = {}
    is_maintenance_time = self.is_maintenance_time(meta=meta)
    self._record_last_maintenance_window_time(is_maintenance_time)

    # Send events with SSM and notify users
    instances = self.o_ec2.get_instances(State="pending,running", main_fleet_only=True)
    instance_ids = [i["InstanceId"] for i in instances]
    event_name = "ENTER_MAINTENANCE_WINDOW_PERIOD" if is_maintenance_time else "EXIT_MAINTENANCE_WINDOW_PERIOD"
    pretty_event_name = "EnterMaintenanceWindowPeriod" if is_maintenance_time else "ExitMaintenanceWindowPeriod"
    self.send_events(instance_ids, "maintenance_window.state_change", event_name, {},
                     notification_handler=self.ssm_maintenance_window_event,
                     pretty_event_name=pretty_event_name)

    # Main fleet Maintenance window management
    if not is_maintenance_time:
        if "NextWindowMessage" in meta:
            log.log(log.NOTICE, meta["NextWindowMessage"])
    else:
        log.log(log.NOTICE, f"Main fleet under Active Maintenance Window until %s : %s" %
                (meta["EndTime"], meta["MatchingWindow"]))
        min_instance_count = _set_tag(None, config, meta["MatchingWindow"])
        if min_instance_count is None:
            min_instance_count = Cfg.get("ssm.feature.maintenance_window.mainfleet.ec2.schedule.min_instance_count")
        config["override:ec2.schedule.min_instance_count"] = min_instance_count
        if min_instance_count == "100%":
            config["override:ec2.schedule.desired_instance_count"] = "100%"

    # Subfleet Maintenance window management
    for subfleet in self.o_ec2.get_subfleet_names():
        meta = {}
        is_maintenance_time = self.is_maintenance_time(fleet=subfleet, meta=meta)
        self._record_last_maintenance_window_time(is_maintenance_time, fleet=subfleet)

        # Send events with SSM and notify users
        instances = self.o_ec2.get_instances(
            State="running",
            instances=self.o_ec2.get_subfleet_instances(subfleet_name=subfleet))
        instance_ids = [i["InstanceId"] for i in instances]
        event_name = "ENTER_MAINTENANCE_WINDOW_PERIOD" if is_maintenance_time else "EXIT_MAINTENANCE_WINDOW_PERIOD"
        self.send_events(instance_ids, "maintenance_window.state_change", event_name, {},
                         notification_handler=self.ssm_maintenance_window_event,
                         pretty_event_name=pretty_event_name)
        if not is_maintenance_time:
            if "NextWindowMessage" in meta:
                log.log(log.NOTICE, meta["NextWindowMessage"])
        else:
            log.log(log.NOTICE, f"Subfleet '{subfleet}' under Active Maintenance Window until %s : %s" %
                    (meta["EndTime"], meta["MatchingWindow"]))
            min_instance_count = _set_tag(subfleet, config, meta["MatchingWindow"])
            if min_instance_count is None:
                min_instance_count = Cfg.get(f"ssm.feature.maintenance_window.subfleet.{subfleet}.ec2.schedule.min_instance_count")
            config[f"override:subfleet.{subfleet}.ec2.schedule.min_instance_count"] = min_instance_count
            if min_instance_count == "100%":
                config[f"override:subfleet.{subfleet}.ec2.schedule.desired_instance_count"] = "100%"
            # Assumed missing f-prefix in the original: the literal pattern
            # '{SubfleetName}' was passed instead of the subfleet name.
            if Cfg.get_int(f"ssm.feature.maintenance_window.subfleet.{subfleet}.force_running"):
                config[f"override:subfleet.{subfleet}.state"] = "running"

    # Register SSM Maintenance Window configuration override
    Cfg.register(config, layer="SSM Maintenance window override", create_layer_when_needed=True)
def main(): """View a simulation of contagion""" args = cli() config.configure(args.conf) nrows = config.get_int("Grid", "rows") ncols = config.get_int("Grid", "cols") population = model.Population(nrows, ncols) # View of the main model view = grid_view.GridView(config.get_int("Grid", "Width"), config.get_int("Grid", "Height"), nrows=nrows, ncols=ncols, title="Contagion", autoflush=False) # Summary statistics stats_view = contagion_stats.Stats(population) # Monitor changes to cells --- # - for monitoring progress # - for updating the main view monitor = change_listener.ChangeListener() # Attach listeners to each cell for row in range(nrows): for col in range(ncols): cell_view = grid_view.CellView(row, col, view) population.cells[row][col].add_listener(cell_view) # Graphics population.cells[row][col].add_listener(monitor) # Change tracking view.update(rate=5) # Initial view, before simulation starts view.update() time.sleep(1) log.info("Seeding") population.seed() # Note this should set change monitor view.update() time.sleep(1) # Evolve until it reaches quiescence log.info("Running") steps = 0 epoch = 0 while monitor.check(): monitor.set(False) # No changes yet in this cycle # An 'epoch' is 10 steps. We stop when an epoch has # gone by without a noticeable state change, and we # chart each epoch rather than each step for _ in range(10): steps += 1 log.debug(f"Step {steps}") population.step() view.update() stats_view.update(day=steps) time.sleep(0.1) epoch += 1 # Print stats and update bar graph after each epoch stats_view.show(day=steps, epoch=epoch) # Simulation is no longer changing. Leave view open # until the user presses enter stats_view.show_summary() i = input("Press enter to close")
def _record_call(need_shortterm_record, is_success_func, f, *args, **kwargs):
    global records
    global notify_mgr
    record = {}
    record["EventType"] = f.__name__
    record["Input"] = {"*args": list(args), "**kwargs": dict(kwargs)}

    managed_exception = None
    xray_recorder.begin_subsegment("notifycall-call:%s" % f.__name__)
    try:
        r = f(*args, **kwargs)
        record["Output"] = json.dumps(r, default=str)
    except Exception as e:
        managed_exception = e
        record["Except"] = {
            "Exception": traceback.format_exc(),
            "Stackstrace": traceback.extract_stack(),
            "Reason": json.dumps(e, default=str)
        }
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        # Persist now all aggregated data to not lose them
        xray_recorder.begin_subsegment("notifycall-persist_aggregates:%s" % f.__name__)
        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Failed to persist aggregated data!")
        xray_recorder.end_subsegment()

    if notify_mgr is None or do_not_notify:
        log.debug("Do not write Event in event table: notify_mgr=%s, do_not_notify=%s" %
                  (notify_mgr, do_not_notify))
        if managed_exception is not None:
            raise managed_exception
        return r

    ctx = notify_mgr.context

    try:
        need_longterm_record = managed_exception is not None or not is_success_func(
            args, kwargs, r) if is_success_func is not None else False
    except Exception as e:
        log.exception("Got an exception while assessing long term event management : %s" % e)
        need_longterm_record = True

    # Try to catch the maximum available metadata to ease later diagnosis
    # Protect against exceptions to ensure proper logging
    record["Metadata"] = {}
    xray_recorder.begin_subsegment("notifycall-build_metadata:%s" % f.__name__)
    try:
        notify_mgr.ec2.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["EC2"] = {
            "AllInstanceDetails": notify_mgr.ec2.get_instances(),
            "AllInstanceStatuses": notify_mgr.ec2.get_instance_statuses(),
            "DrainingInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="draining")
            ],
            "BouncedInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="bounced")
            ],
            "ExcludedInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="excluded")
            ],
            "ErrorInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="error")
            ],
            "ScalingStates": notify_mgr.ec2.get_all_scaling_states()
        }
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["EC2"] : %s' % e)
    xray_recorder.end_subsegment()

    xray_recorder.begin_subsegment("notifycall-build_metadata_targetgroup:%s" % f.__name__)
    try:
        notify_mgr.targetgroup.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["TargetGroups"] = notify_mgr.targetgroup.get_targetgroups_info()
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["TargetGroups"] : %s' % e)
    xray_recorder.end_subsegment()

    for key in ["Metadata"]:
        zipped_bytes = gzip.compress(bytes(json.dumps(record[key], default=str), "utf-8"))
        record[key] = str(base64.b64encode(zipped_bytes), "utf-8")

    now = misc.utc_now()
    now_seconds = misc.seconds_from_epoch_utc()
    max_longterm_records = Cfg.get_int("notify.event.longterm.max_records")
    if max_longterm_records <= 0:
        need_longterm_record = 0

    tables = [
        {
            "Name": ctx["EventTable"],
            "NeedWrite": need_shortterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.default_ttl"),
            "DBImages": False,
            "DebugReport": False
        },
        {
            "Name": ctx["LongTermEventTable"],
            "NeedWrite": need_longterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "DBImages": True,
            "DebugReport": True
        },
    ]

    xray_recorder.begin_subsegment("notifycall-update_tables:%s" % f.__name__)
    for table in tables:
        if not table["NeedWrite"]:
            continue
        UpdateExpression = "set EventSource=:entrypoint, EventType=:eventtype, InputData=:input, OutputData=:output, HandledException=:exception, "
        UpdateExpression += "Metadata=:metadata, ExpirationTime=:expirationtime"
        ExpressionAttributeValues = {
            ':entrypoint': {
                'S': ctx["FunctionName"]
            },
            ':eventtype': {
                'S': record["EventType"]
            },
            ':input': {
                'S': json.dumps(record["Input"], default=str)
            },
            ':output': {
                'S': json.dumps(record["Output"] if "Output" in record else {}, default=str)
            },
            ':exception': {
                'S': json.dumps(record["Except"] if "Except" in record else "", default=str)
            },
            ':metadata': {
                'S': json.dumps(record["Metadata"], default=str)
            },
            ':expirationtime': {
                'N': str(now_seconds + table["TTL"])
            }
        }
        if table["DBImages"]:
            # Insert snapshots of the CloudWatch dashboard
            try:
                log.log(log.NOTICE, "Generating snapshots for Dashboard graphs...")
                images = notify_mgr.cloudwatch.get_dashboard_images()
                for i in images:
                    compressed_name = i.replace(" ", "")
                    UpdateExpression += ", Graph_%s_PNG=:graph%s" % (compressed_name, compressed_name)
                    ExpressionAttributeValues[":graph%s" % compressed_name] = {
                        'S': images[i]
                    }
                log.info("/!\ Generated CloudWatch dashboard PNG snapshots in DynamoDb table '%s' for further event analysis!"
                         % table["Name"])
            except Exception as e:
                log.exception("Failed to retrieve CloudWatch snapshot images! : %s" % e)

        response = ctx["dynamodb.client"].update_item(
            Key={"EventDate": {
                'S': str(now)
            }},
            UpdateExpression=UpdateExpression,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnConsumedCapacity='TOTAL',
            TableName=table["Name"],
        )
        log.debug(Dbg.pprint(response))
        log.log(log.NOTICE, "Written event '[%s] %s' to table '%s'." %
                (str(now), record["EventType"], table["Name"]))
% (str(now), record["EventType"], table["Name"])) # Keep under control the number of LongTerm items stored in DynamoDB table if need_longterm_record: longterm_item_eventdates = [ m["_"] for m in notify_mgr.state.get_metastring_list( "notify.longterm.itemlist", default=[]) ] log.log(log.NOTICE, "Guessed number of records in LongTerm Event table : %d", len(longterm_item_eventdates)) longterm_item_eventdates.append(str(now)) nb_records_to_delete = max( len(longterm_item_eventdates) - max_longterm_records, 0) for eventdate in longterm_item_eventdates[:nb_records_to_delete]: try: response = ctx["dynamodb.client"].delete_item( Key={'EventDate': { 'S': eventdate }}, TableName=ctx["LongTermEventTable"]) log.debug(response) log.log( log.NOTICE, "Purged LongTerm Event record '%s' as too many are already stored (notify.event.longterm.max_records=%d)" % (eventdate, max_longterm_records)) except Exception as e: log.exception( "Got exception while deleting LongTerm record '%s' : %e" % (eventdate, e)) notify_mgr.state.set_state( "notify.longterm.itemlist", ";".join(longterm_item_eventdates[nb_records_to_delete:]), TTL=Cfg.get_duration_secs("notify.event.longterm.ttl")) try: KVTable.persist_aggregates() except Exception as e: log.exception("Got exception while persisting KVTables : %s" % e) # Manage Debug report export to S3 url = ctx["LoggingS3Path"] if url != "" and table["DebugReport"] and Cfg.get_int( "notify.debug.send_s3_reports"): xray_recorder.begin_subsegment( "notifycall-publish_all_reports:%s" % f.__name__) if ctx["FunctionName"] == "Interact": # Avoid recursion if throwing from InteractFunction log.info("Publishing Debug reports synchronously...") debug.publish_all_reports(ctx, url, "notifymgr_report") else: client = ctx["sqs.client"] log.info( "Notifying Interact SQS Queue '%s' for asynchronous debug report generation..." % ctx["InteractSQSUrl"]) response = client.send_message(QueueUrl=ctx["InteractSQSUrl"], MessageBody=json.dumps({ "OpType": "Debug/PublishReportNow", "Events": { "Timestamp": str(ctx["now"]) } })) log.debug(response) xray_recorder.end_subsegment() xray_recorder.end_subsegment() if managed_exception is not None: raise managed_exception return r
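# The Metadata attribute written above is gzip-compressed and base64-encoded before
# storage, so any consumer of the event table must reverse both steps. A minimal
# decoding sketch (assumes `item` is a raw DynamoDB get_item() result produced by
# the loop above; the helper name is illustrative):
import base64
import gzip
import json

def decode_record_metadata(item):
    # base64 text stored in the 'Metadata' attribute of the event table
    blob = item["Metadata"]["S"]
    # gunzip back to the original JSON bytes, then parse
    raw = gzip.decompress(base64.b64decode(blob))
    return json.loads(raw.decode("utf-8"))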
def get_prerequisites(self):
    if Cfg.get_int("cron.disable"):
        return

    self.scheduler_table = kvtable.KVTable(self.context, self.context["SchedulerTable"])

    # Compute event names
    self.load_event_definitions()

    # Read all existing event rules
    client = self.context["events.client"]
    params = {
        "NamePrefix": "CS-Cron-%s%s-" % (self.context["GroupName"], self.context["VariantNumber"]),
        "Limit": 10
    }
    rules = []
    while True:
        response = client.list_rules(**params)
        if "Rules" in response:
            rules.extend(response["Rules"])
        if "NextToken" not in response:
            break
        params["NextToken"] = response["NextToken"]
    self.rules = rules

    max_rules_per_batch = Cfg.get_int("cron.max_rules_per_batch")

    # Create missing rules
    expected_rule_names = [r["Name"] for r in self.event_names]
    existing_rule_names = [r["Name"] for r in self.rules]
    for r in expected_rule_names:
        if r not in existing_rule_names:
            max_rules_per_batch -= 1
            if max_rules_per_batch <= 0:
                break
            rule_def = self.get_ruledef_by_name(r)
            try:
                response = client.put_rule(
                    Name=r,
                    Description="Schedule Event '%s': %s" % (rule_def["EventName"], rule_def["Event"]),
                    RoleArn=self.context["CloudWatchEventRoleArn"],
                    ScheduleExpression=rule_def["Data"][0]["schedule"],
                    State='ENABLED')
                log.debug("put_rule: %s" % response)
            except Exception as e:
                log.exception("Failed to create scheduler event '%s' (%s) : %s" %
                              (r, rule_def["Data"][0]["schedule"], e))

            try:
                response = client.put_targets(
                    Rule=r,
                    Targets=[{
                        'Arn': self.context["InteractLambdaArn"],
                        'Id': "id%s" % r,
                    }])
                log.debug("put_targets: %s" % response)
            except Exception as e:
                log.exception("Failed to set targets for event rule '%s' : %s" % (r, e))

    # Garbage-collect obsolete rules
    for r in existing_rule_names:
        if r not in expected_rule_names:
            max_rules_per_batch -= 1
            if max_rules_per_batch <= 0:
                break
            try:
                client.remove_targets(Rule=r, Ids=["id%s" % r])
                client.delete_rule(Name=r)
            except Exception as e:
                log.exception("Failed to delete rule '%s' : %s" % (r, e))
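# The manual NextToken loop above could also lean on boto3's built-in paginator for
# list_rules. A minimal sketch, assuming a boto3 session configured with credentials
# and region (the prefix value and helper name are illustrative):
import boto3

def list_rules_with_prefix(prefix):
    # Let the paginator drive the NextToken handling; PageSize mirrors the
    # Limit=10 used above.
    client = boto3.client("events")
    rules = []
    paginator = client.get_paginator("list_rules")
    for page in paginator.paginate(NamePrefix=prefix, PaginationConfig={"PageSize": 10}):
        rules.extend(page.get("Rules", []))
    return rules

# e.g. rules = list_rules_with_prefix("CS-Cron-mygroup1-")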
def get_prerequisites(self, only_if_not_already_done=False):
    if only_if_not_already_done and self.prereqs_done:
        return

    self.state_table = self.o_state.get_state_table()
    client = self.context["ec2.client"]

    # Retrieve list of instances with the appropriate tag
    Filters = [{
        'Name': 'tag:clonesquad:group-name',
        'Values': [self.context["GroupName"]]
    }]

    instances = []
    response = None
    while response is None or "NextToken" in response:
        q = {
            "Filters": Filters,
            "MaxResults": Cfg.get_int("ec2.describe_instances.max_results")
        }
        # Pass NextToken only on continuation calls
        if response is not None:
            q["NextToken"] = response["NextToken"]
        response = client.describe_instances(**q)
        for reservation in response["Reservations"]:
            instances.extend(reservation["Instances"])

    # Filter out instances in an inappropriate state
    non_terminated_instances = []
    for i in instances:
        if i["State"]["Name"] not in ["shutting-down", "terminated"]:
            non_terminated_instances.append(i)

    self.instances = non_terminated_instances
    self.instance_ids = [i["InstanceId"] for i in self.instances]

    # Enrich describe_instances output with instance type details
    if Cfg.get_int("ec2.describe_instance_types.enabled"):
        self.instance_types = []
        for i in self.instances:
            if i["InstanceType"] not in self.instance_types:
                self.instance_types.append(i["InstanceType"])
        if len(self.instance_types):
            response = client.describe_instance_types(InstanceTypes=self.instance_types)
            self.instance_type_details = response["InstanceTypes"]
            for i in self.instances:
                i["_InstanceType"] = next(
                    filter(lambda it: it["InstanceType"] == i["InstanceType"],
                           self.instance_type_details), None)

    # Get instance statuses
    instance_statuses = []
    response = None
    while response is None or "NextToken" in response:
        q = {"InstanceIds": self.instance_ids}
        if response is not None and "NextToken" in response:
            q["NextToken"] = response["NextToken"]
        response = client.describe_instance_status(**q)
        instance_statuses.extend(response["InstanceStatuses"])
    self.instance_statuses = instance_statuses

    # Get AZ status
    response = client.describe_availability_zones()
    self.availability_zones = response["AvailabilityZones"]
    if len(self.availability_zones) == 0:
        raise Exception("Can't have a region with no AZ...")

    self.az_with_issues = []
    if not Cfg.get_int("ec2.az.statusmgt.disable"):
        for az in self.availability_zones:
            if az["State"] in ["impaired", "unavailable"]:
                self.az_with_issues.append(az)
            if az["State"] != "available":
                log.warning(
                    "AZ %s(%s) is marked with status '%s' by EC2.describe_availability_zones() API!" %
                    (az["ZoneName"], az["ZoneId"], az["State"]))
    else:
        log.warning("Automatic AZ issue detection through describe_availability_zones() is DISABLED "
                    "(ec2.az.statusmgt.disable != 0)...")

    # Use these config keys to simulate an AWS Large Scale Event
    all_az_names = [az["ZoneName"] for az in self.availability_zones]
    all_az_ids = [az["ZoneId"] for az in self.availability_zones]
    for a in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
        if a not in all_az_names and a not in all_az_ids:
            log.warning("ec2.debug.availability_zones_impaired does not match local AZs! '%s'" % a)
    for a in Cfg.get_list("ec2.az.unavailable_list", default=[]):
        if a not in all_az_names and a not in all_az_ids:
            log.warning("ec2.az.unavailable_list does not match local AZs! '%s'" % a)

    for az in self.availability_zones:
        zone_name = az["ZoneName"]
        zone_id = az["ZoneId"]
        zone_state = az["State"]
        if zone_name in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
            zone_state = "impaired"
        if zone_id in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
            zone_state = "impaired"
        if zone_name in Cfg.get_list("ec2.az.unavailable_list", default=[]):
            zone_state = "unavailable"
        if zone_id in Cfg.get_list("ec2.az.unavailable_list", default=[]):
            zone_state = "unavailable"
        if (zone_state != az["State"] and zone_state in ["impaired", "unavailable"]
                and az not in self.az_with_issues):
            self.az_with_issues.append(az)
        az["State"] = zone_state
        if zone_state != "available":
            log.warning("AZ %s(%s) is marked with status '%s' by configuration keys!" %
                        (zone_name, zone_id, zone_state))

    # Register static subfleet configuration keys dynamically to avoid a
    # 'key unknown' warning when the user sets one
    static_subfleet_names = self.get_static_subfleet_names()
    for static_fleet in static_subfleet_names:
        key = "staticfleet.%s.state" % static_fleet
        if not Cfg.is_builtin_key_exist(key):
            Cfg.register({key: ""})
    log.log(log.NOTICE, "Detected the following static subfleet names across EC2 resources: %s" %
            static_subfleet_names)

    self.prereqs_done = True
def main_handler_entrypoint(event, context):
    """
    Parameters
    ----------
    event: dict, required
    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    None
    """

    #print(Dbg.pprint(event))

    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"

    init()

    if Cfg.get_int("app.disable") != 0 and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return

    no_is_called_too_early = False
    # Manage Spot interruption as fast as we can
    if sqs.process_sqs_records(event, function=ec2_schedule.manage_spot_notification, function_arg=ctx):
        log.info("Managed Spot Interruption SQS record!")
        # Force a run now, disregarding `app.run_period`, as we have at least one Spot
        # instance to remove from target groups immediately
        no_is_called_too_early = True

    # Check that we are not called too early
    # Note: We perform a direct read of the KVTable to spare initialization time when the
    # Lambda is called too early
    ctx["main.last_call_date"] = ctx["o_ec2"].get_state("main.last_call_date", direct=True)
    if ctx["main.last_call_date"] is None or ctx["main.last_call_date"] == "":
        ctx["main.last_call_date"] = str(misc.epoch())

    if not no_is_called_too_early and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return

    log.debug("Load prerequisites.")
    load_prerequisites(["o_state", "o_notify", "o_ec2", "o_cloudwatch", "o_targetgroup",
                        "o_ec2_schedule", "o_scheduler", "o_rds"])

    # Remember 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date", value=ctx["now"],
                           TTL=Cfg.get_duration_secs("app.default_ttl"))

    Cfg.dump()

    # Perform actions
    log.debug("Main processing.")
    ctx["o_targetgroup"].manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()

    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()

    # If we were woken up by SNS, acknowledge the message(s) now
    sqs.process_sqs_records(event)

    ctx["o_notify"].notify_user_arn_resources()

    # Call me back if needed
    sqs.call_me_back_send()
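# For a quick local smoke test outside SAM, the entrypoint can be driven with a stub
# Lambda context. A sketch (FakeLambdaContext and the empty event are hypothetical;
# the downstream calls still require reachable AWS resources and credentials):
if __name__ == "__main__":
    class FakeLambdaContext:
        # Minimal stand-in: only attributes actually read during the run need to exist
        function_name = "Main"
        memory_limit_in_mb = 128
        invoked_function_arn = "arn:aws:lambda:eu-west-1:000000000000:function:Main"
        aws_request_id = "local-test"

    # An empty Records list means there is no SQS/SNS message to acknowledge
    main_handler_entrypoint({"Records": []}, FakeLambdaContext())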