Example #1
def LoadRewardsDict():
    sFileRewards = config.get_string('REWARDS_FILE');
    dConnRewardsIndexes = collections.defaultdict(lambda:{'iNumNeg':0, 'iNumPos':0});
    iMaxIter = config.get_int('REWARDS_MAX_ITER');
    iMinIter = config.get_int('REWARDS_MIN_ITER');
    iCurIter = 0;
    for sLine in open(sFileRewards):
        sLine = sLine.strip();
        lConns = sLine.split();
        if len(lConns) > 5:
            iCurIter += 1;
            if iCurIter > iMaxIter:
                break;
            if iCurIter < iMinIter:
                continue;
        for sConn in lConns:
            iFrom, iTo, iNumPos, iNumNeg = sConn.split(':');
            iNumPos = int(iNumPos);
            iNumNeg = int(iNumNeg);
            iFrom = int(iFrom);
            iTo = int(iTo);
            dConnRewardsIndexes[(iFrom,iTo)]['iNumNeg'] += iNumNeg; 
            dConnRewardsIndexes[(iFrom,iTo)]['iNumPos'] += iNumPos; 
    dIndexToPredList = predicate.PredDictFileToPredListDict(config.get_string('PRED_DICT_FILE'), lambda x:x.iIndex);
    dConnRewardsStrings = collections.defaultdict(lambda:{'iNumNeg':0, 'iNumPos':0});
    for (iFrom, iTo), dPosNeg in dConnRewardsIndexes.items():
        sFrom = dIndexToPredList[iFrom][0].GetObject();
        sTo = dIndexToPredList[iTo][0].GetObject();
        dConnRewardsStrings[(sFrom, sTo)]['iNumNeg'] += dPosNeg['iNumNeg'];
        dConnRewardsStrings[(sFrom, sTo)]['iNumPos'] += dPosNeg['iNumPos'];
    return dConnRewardsStrings;
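The parser above assumes each whitespace-separated token in REWARDS_FILE encodes one connection as four colon-separated integers ("from:to:numPos:numNeg"). A minimal self-contained sketch of that assumed line format (the sample values are invented for illustration):

import collections

# Hypothetical line in the assumed REWARDS_FILE format:
# each token is "from:to:numPos:numNeg".
sLine = "3:7:2:1 7:9:0:4 3:7:1:0"
dConn = collections.defaultdict(lambda: {'iNumNeg': 0, 'iNumPos': 0})
for sConn in sLine.split():
    iFrom, iTo, iNumPos, iNumNeg = (int(s) for s in sConn.split(':'))
    dConn[(iFrom, iTo)]['iNumPos'] += iNumPos
    dConn[(iFrom, iTo)]['iNumNeg'] += iNumNeg
print(dConn[(3, 7)])  # {'iNumNeg': 1, 'iNumPos': 3}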
Example #2
class Config(object):
    """Gatling runner package configuration."""

    # Download directory for getting gatling packages
    DOWNLOAD_DIRECTORY = os.path.join(os.path.dirname(__file__), "download")

    # Gatling package repository default settings
    GATLING_REPO_URL = os.environ.get("GATLING_REPO_URL")
    GATLING_REPO_NAME = os.environ.get("GATLING_REPO_NAME")
    GATLING_REPO_VERSION = os.environ.get("GATLING_REPO_VERSION")

    # Gatling ssh connector default settings
    GATLING_SSH_PORT = os.environ.get("GATLING_SSH_PORT")
    GATLING_SSH_USERNAME = os.environ.get("GATLING_SSH_USERNAME")
    GATLING_SSH_HOST = os.environ.get("GATLING_SSH_HOST")
    GATLING_SSH_KEY_PATH = os.environ.get("GATLING_SSH_KEY_PATH", config.jumpbox_key_path)

    # Gatling tests connection proxy settings
    GATLING_PROXY = os.environ.get("GATLING_PROXY")
    GATLING_PROXY_HTTP_PORT = config.get_int("GATLING_PROXY_HTTP_PORT")
    GATLING_PROXY_HTTPS_PORT = config.get_int("GATLING_PROXY_HTTPS_PORT")

    # How long (in seconds) to wait before making the next gatling results check
    TIME_BEFORE_NEXT_TRY = 300

    # How many times to try to get gatling logs when the log file size does not change
    NUMBER_OF_TRIALS_WITHOUT_LOG_CHANGE = 2

    GATLING_PACKAGE_FILE_NAME = "{}-{}.jar".format(GATLING_REPO_NAME, GATLING_REPO_VERSION)
    GATLING_PACKAGE_FILE_URL = "{}{}/{}/{}".format(GATLING_REPO_URL, GATLING_REPO_NAME, GATLING_REPO_VERSION,
                                                   GATLING_PACKAGE_FILE_NAME)
    GATLING_PACKAGE_FILE_PATH = os.path.join(DOWNLOAD_DIRECTORY, GATLING_PACKAGE_FILE_NAME)
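Because os.environ.get returns None for unset variables, GATLING_PACKAGE_FILE_NAME above degrades to "None-None.jar" when the repo settings are absent. A hedged sketch of guarding against that with explicit defaults (the fallback values here are invented):

import os

# Illustrative fallbacks only; the real names and versions come from the
# environment, as in the class above.
repo_name = os.environ.get("GATLING_REPO_NAME") or "example-gatling-package"
repo_version = os.environ.get("GATLING_REPO_VERSION") or "0.0.1"
print("{}-{}.jar".format(repo_name, repo_version))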
Example #3
def LoadAllFullRewardsFiles():
    sRewardsDir = config.get_string('FULL_REWARDS_LOG_DIR');
    lRewards = [];
    print "Loading 200 Files From:", sRewardsDir;
    for iFileNum in range(config.get_int('REWARDS_MIN_ITER'), config.get_int('REWARDS_MAX_ITER')):
        sFile = sRewardsDir + '/predictions.log.' + str(iFileNum);
        lRewards.extend(LoadSingleFullRewardsFile(sFile));
    print "Done Loading";
    print "NumFF:", Reward.ComputeNumFfsFromList(lRewards);
    sys.exit(-1);  # NOTE: unconditional debug exit; the return below is unreachable
    return lRewards;
Example #4
    def CalcDisToTerminals(self, iIndex, iLen):
        iWindowSize = config.get_int('FEATURES:WINDOW_SIZE');
        iLeftDis = iIndex;
        iRightDis = iLen - iIndex - 1;
        if iLeftDis > iWindowSize: iLeftDis = iWindowSize + 1;
        if iRightDis > iWindowSize: iRightDis = iWindowSize + 1;
        return iLeftDis, iRightDis;
Example #5
def get_proxy_settings(config=None, conn=None):
    r"""
    Return proxy settings as a ProxySettings object

    The caller must specify either config or conn.

    Arguments:
     - `config`: An osdlyrics.config.Config object used to retrieve proxy
                 settings. If it is not set, the caller MUST set `conn` to a
                 valid D-Bus connection to create a Config object.
     - `conn`: A D-Bus connection object, used when `config` is not
               specified.
    """
    if config is None and conn is None:
        raise ValueError('Either config or conn must be specified')
    if config is None:
        # The `config` parameter shadows the osdlyrics.config module here, so
        # the Config class (assumed imported from osdlyrics.config) must be
        # referenced directly; `config.Config(conn)` would fail on None.
        config = Config(conn)
    proxy_type = config.get_string('Download/proxy')
    if proxy_type.lower() == 'no':
        return ProxySettings(protocol='no')
    if proxy_type.lower() == 'manual':
        protocol = config.get_string('Download/proxy-type')
        host = config.get_string('Download/proxy-host')
        port = config.get_int('Download/proxy-port')
        username = config.get_string('Download/proxy-username')
        passwd = config.get_string('Download/proxy-passwd')
        return ProxySettings(protocol=protocol, host=host, port=port,
                            username=username, password=passwd)
    if proxy_type.lower() == 'system':
        return detect_system_proxy()
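For reference, a hedged sketch of driving get_proxy_settings with a stand-in config object; the stub class and its values are invented, only the key names come from the function above:

class StubConfig(object):
    _values = {
        'Download/proxy': 'manual',
        'Download/proxy-type': 'http',
        'Download/proxy-host': 'proxy.example.com',
        'Download/proxy-username': '',
        'Download/proxy-passwd': '',
    }
    def get_string(self, key):
        return self._values[key]
    def get_int(self, key):
        return 8080  # only 'Download/proxy-port' is read here

settings = get_proxy_settings(config=StubConfig())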
Example #6
    def debug_inject_fault(self, instance_id, targetgroup, default, nolog=False):
        instance = self.ec2.get_instance_by_id(instance_id)
        if instance is None or instance["State"]["Name"] != "running": return default

        if Cfg.get_int("ec2.az.evict_instances_when_az_faulty"):
            # If AWS indicates issues with an AZ, we assume instances located in it are 'unavail'
            if instance["Placement"]["AvailabilityZone"] in self.ec2.get_azs_with_issues(): 
                return "unavail"

        directives = Cfg.get("targetgroup.debug.inject_fault_status").split(",")

        for directive in directives:
            if directive == "": continue

            criteria, fault = directive.split(":")
            c = criteria.split("&")
            criteria = c[0]
            # Check if a targetgroup name constraint is set
            if len(c) > 1:
                if targetgroup is not None and self.get_short_targetgroup_name(targetgroup) not in c:
                    continue
            instance_id = instance["InstanceId"]
            if criteria == instance_id or criteria == instance["Placement"]["AvailabilityZone"]:
                if not nolog:
                    log.warning("Injecting targetgroup fault '%s/%s' for instance '%s'!" % 
                            (targetgroup if targetgroup is not None else "All targetgroups", fault, instance_id))
                return fault
        return default
Example #7
class WusstestDuSchon(commands.Cog):
    def __init__(self, bot):
        self.bot = bot

    @routines.routine(minutes=config.get_int("WusstestDuSchonLoop"))
    async def loop(self):
        if await self.bot.stream():
            channel = self.bot.channel()
            prefix = config.get_value("WusstestDuSchonPrefix")
            message = self.get_random_message(prefix)
            await self.bot.send_me(channel, message)

    @staticmethod
    def get_random_message(prefix):
        conn = sqlite3.connect("db.sqlite3")

        c = conn.cursor()
        c.execute(
            'SELECT text, use_prefix from haugebot_web_wusstestduschon where active is true'
        )
        wusstestduschon = random.choice(c.fetchall())
        conn.close()

        if wusstestduschon[1] == 1:
            return prefix.strip() + " " + wusstestduschon[0].strip()
        else:
            return wusstestduschon[0]

    def change_interval(self, minutes):
        pass
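get_random_message assumes a local db.sqlite3 containing a haugebot_web_wusstestduschon table; a hedged sketch of a compatible schema for local testing, with the column layout inferred from the query above:

import sqlite3

conn = sqlite3.connect("db.sqlite3")
conn.execute("""CREATE TABLE IF NOT EXISTS haugebot_web_wusstestduschon
                (text TEXT, use_prefix INTEGER, active INTEGER)""")
conn.execute("INSERT INTO haugebot_web_wusstestduschon VALUES (?, ?, ?)",
             ("Sample fact text", 1, 1))
conn.commit()
conn.close()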
Example #8
def Evaluate(lSamples):
    bCollapseFirst = config.get_bool('COLLAPSE_FIRST');
    lFScores = [];
    lPrecisions = [];
    lRecalls = [];
    dFalsePosCounts = collections.defaultdict(lambda:PredData(bPos = True));
    dFalseNegCounts = collections.defaultdict(lambda:PredData(bPos = False));
    dTruePosCounts = collections.defaultdict(lambda:PredData(bPos = True));
    dTotalCounts = collections.defaultdict(lambda:0);
    for iIter in range(config.get_int('NUM_ITER')):
        lTrain, lTest = SplitTrainTest(lSamples);
        if config.get_bool('SVM'):
            assert not config.get_bool('LOG_LINEAR');
            lTest, dFeatureWeights = TrainAndTestSvm(lTrain, lTest);
        elif config.get_bool('LOG_LINEAR'):
            lTest, dFeatureWeights = log_linear.TrainAndTestFromGranular(lTrain, lTest);
        else:
            assert False;

        if config.get_bool('WRITE_TRUE_POS_AND_FALSE_NEG'):
            UpdateBadPredCounts(dFalsePosCounts, dFalseNegCounts, dTruePosCounts, dTotalCounts, dFeatureWeights, lTest);
        fScore, fPrec, fRecall = AnalyzePredsSimple(lTest);
        lFScores.append(fScore);
        lPrecisions.append(fPrec);
        lRecalls.append(fRecall);
    if config.get_bool('WRITE_TRUE_POS_AND_FALSE_NEG'):
        WriteBadPredCounts(dFalsePosCounts, dFalseNegCounts, dTruePosCounts, dTotalCounts);
    for fScore in lFScores:
        print "FScore is:", fScore;
    print "Average Precision: ", np.average(lPrecisions), "\tStd: ", np.std(lPrecisions);
    print "Average Recall: ", np.average(lRecalls), "\tStd: ", np.std(lRecalls);
    print "Average F-Score: ", np.average(lFScores), "\tStd: ", np.std(lFScores);
Example #9
    def ack_event_dates(self, event_dates):
        client = self.context["dynamodb.client"]
        table_name = self.context["EventTable"]
        for date in event_dates:
            if Cfg.get_int("notify.event.keep_acked_records"):
                response = client.update_item(
                    Key={"EventDate": {'S': date}},
                    UpdateExpression="set AckDate=:ackdate",
                    ExpressionAttributeValues={
                        ':ackdate': {'S': str(self.context["now"])}
                    },
                    ConditionExpression="attribute_exists(EventDate)",
                    ReturnConsumedCapacity='TOTAL',
                    TableName=table_name,
                )
            else:
                response = client.delete_item(
                    Key={'EventDate': {'S': date}},
                    TableName=table_name)
            log.debug(Dbg.pprint(response))
Example #10
    def GetPredPos(self, bIgnoreDir = False):
        if config.get_bool('SVM'):
            fThres = config.get_int('SVM_THRESHOLD');
        elif config.get_bool('LOG_LINEAR'):
            fThres = 0.5
        else:
            assert False;
        return (self.GetPred(bIgnoreDir) > fThres);
Example #11
def GenAllGranularSamplesFromList(lSentences, sLogFileName):
    sSentenceLogFile = config.get_string('SENTENCE_LOG_FILE');
    fLog = open(sSentenceLogFile, 'w');
    lSamples = [];
    iNumLoopy = 0;
    for sentence in lSentences:
        iCurNumLoopy, lCurSamples = sentence.GenAllGranularSamples(fLog);
        lSamples.extend(lCurSamples);
        iNumLoopy += iCurNumLoopy;

    if iNumLoopy > 0:
        print "NUM LOOPY:", iNumLoopy;
    assert iNumLoopy < config.get_int('NUM_ALLOWED_LOOPY'), 'Too Many Loopy: ' + str(iNumLoopy) + ' NonLoopy: ' + str(len(lSamples));

    sGoldDepFile = config.get_string('GOLD_DEP_FILE');
    if sGoldDepFile != '':
        dGoldDeps = data.file_to_obj_with_comments(sGoldDepFile);
        # add the gold dep info
        for sample in lSamples:
            if (sample.pddlconn.sPddlTo in dGoldDeps) and (sample.pddlconn.sPddlFrom in dGoldDeps[sample.pddlconn.sPddlTo]):
                sample.bGoldPos = True;

    sPredDictFile = config.get_string('PRED_DICT_FILE');
    if sPredDictFile != '':
        lPredicates = predicate.PredDictFileToPredList(sPredDictFile);
        dObjToPredList = collections.defaultdict(lambda:[]);
        for predCur in lPredicates:
            dObjToPredList[predCur.GetObject()].append(predCur);
        for sample in lSamples:
            sample.pddlconn.lFromPreds = dObjToPredList[sample.pddlconn.sPddlFrom];
            sample.pddlconn.lToPreds = dObjToPredList[sample.pddlconn.sPddlTo];
    else:
        assert False;
    # prune the unnecessary features
    dFeatureCounts = collections.defaultdict(lambda:0);
    for sample in lSamples:
        for iFeature in sample.features.GetFeatureIndexList():
            dFeatureCounts[iFeature] += 1;
    iMinFeatureCount = config.get_int('MIN_FEATURE_OCCURANCE_COUNT');
    for sample in lSamples:
        for iFeature in sample.features.GetFeatureIndexList():
            if dFeatureCounts[iFeature] < iMinFeatureCount:
                sample.features.RemoveFeature(iFeature);
    return lSamples;
Example #12
def route_message(message, peer):
    if message[b"ttl"] > config.get_int("maxttl"):
        return

    np = tracerList.select_peer(message[b"to"])

    if np:
        message[b"ttl"] += 1
        np.sent += 1
        np.send_packet(message)
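For context, a hedged sketch of the message shape route_message expects: bencoded packets decode to dicts keyed by bytes (hence b"ttl"); all values below are invented:

# Hypothetical decoded packet; maxttl stands in for
# config.get_int("maxttl") used by the router above.
message = {b"ttl": 0, b"to": b"node-42", b"from": b"node-7"}
maxttl = 16
if message[b"ttl"] <= maxttl:
    message[b"ttl"] += 1  # forward one hop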
Example #13
    def GetPredPos(self, bIgnoreDir = False):
        if bIgnoreDir:
            return self.bPredPos or self.pddlconn.pddlconnReverse.sample.bPredPos;

        if config.get_bool('SVM'):
            fThres = config.get_int('SVM_THRESHOLD');
        elif config.get_bool('LOG_LINEAR'):
            fThres = 0.5
        else:
            assert False;
        return (self.fPred > fThres);
Example #14
    def send_events(self, instance_ids, event_class, event_name, event_args, pretty_event_name=None, notification_handler=None):
        if not Cfg.get_int("ssm.enable"):
            return False

        now            = self.context["now"]
        default_struct = {
            "EventName": None,
            "InstanceIdSuccesses": [],
            "InstanceIdsNotified": []
        }
        event_desc = self.o_state.get_state_json(f"ssm.events.class.{event_class}", default=default_struct, TTL=self.ttl)
        if event_name != event_desc["EventName"]:
            event_desc["EventName"]           = event_name
            event_desc["InstanceIdSuccesses"] = []
            event_desc["InstanceIdsNotified"] = []

        # Notify users
        if event_name is not None and notification_handler is not None:
            not_notified_instance_ids = [i for i in instance_ids if i not in event_desc["InstanceIdsNotified"]]
            if len(not_notified_instance_ids):
                R(None, notification_handler, InstanceIds=not_notified_instance_ids, 
                        EventClass=event_class, EventName=event_name, EventArgs=event_args)
                event_desc["InstanceIdsNotified"].extend(not_notified_instance_ids)

        # Send SSM events to instances
        if event_name is None:
            event_desc = default_struct
        elif Cfg.get_int("ssm.feature.events.ec2.maintenance_window_period"):
            ev_ids = [i for i in instance_ids if i not in event_desc["InstanceIdSuccesses"]]
            if len(ev_ids):
                log.log(log.NOTICE, f"Send event {event_class}: {event_name}({event_args}) to {ev_ids}")
                if pretty_event_name is None: 
                    pretty_event_name = "SendEvent"
                comment  = f"CS-{pretty_event_name} (%s)" % self.context["GroupName"]
                r = self.run_command(ev_ids, event_name, args=event_args, comment=comment)
                for i in [i for i in ev_ids if i in r]:
                    if r[i]["Status"] == "SUCCESS":
                        # Keep track that we received a SUCCESS for this instance id to not resend it again later
                        event_desc["InstanceIdSuccesses"].append(i)

        self.o_state.set_state_json(f"ssm.events.class.{event_class}", event_desc, TTL=self.ttl)
Example #15
    def is_instance_state(self, instance_id, state):
        i = next(
            filter(lambda i: i["InstanceId"] == instance_id,
                   self.instance_statuses), None)
        if Cfg.get_int("ec2.az.evict_instances_when_az_faulty") and "az_evicted" in state:
            az = self.get_instance_by_id(instance_id)["Placement"]["AvailabilityZone"]
            if az in self.get_azs_with_issues():
                return True
        return i["InstanceStatus"]["Status"] in state if i is not None else False
Example #16
    def __init__(self, population: model.Population):
        self.pop = population
        # Accompanying chart of current cases and total deaths
        chart_width = config.get_int("Chart", "Width")
        chart_height = config.get_int("Chart", "Height")
        self.chart = bar_chart.Chart(chart_width,
                                     chart_height,
                                     config.get_int("Chart", "Cols"),
                                     v_min=0,
                                     v_max=config.get_int("Chart", "Max"),
                                     title="Current cases, cumulative deaths")
        # Move the chart out from under the main model view
        self.chart.win.master.geometry(f"{chart_width}x{chart_height}-5+0")
        #
        # Summary stats
        self.max_symptomatic = 0
        self.max_period_dead = 0
        self.prior_day_dead = 0
        self.prior_period_dead = 0
        self.max_symptomatic_day = 0
        self.max_deaths_day = 0
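The two-argument config.get_int("Chart", "Width") calls above suggest a configparser-backed helper. A minimal hedged sketch of such a wrapper; the ini contents and the get_int function are illustrative, not the project's actual config module:

import configparser

_parser = configparser.ConfigParser()
_parser.read_string("""
[Chart]
Width = 400
Height = 300
Cols = 50
Max = 1000
""")

def get_int(section, key):
    # Thin stand-in mirroring the config.get_int(section, key) calls above.
    return _parser.getint(section, key)

assert get_int("Chart", "Width") == 400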
Example #17
    def configure_dashboard(self):
        client = self.context["cloudwatch.client"]
        # The CloudWatch service bills dashboard API calls, so make sure we do not call them too often
        now = self.context["now"]
        dashboard_state = Cfg.get_int("cloudwatch.dashboard.use_default")
        dashboard_last_state = self.ec2.get_state(
            "cloudwatch.dashboard.use_default.last_state")
        self.ec2.set_state("cloudwatch.dashboard.use_default.last_state",
                           dashboard_state,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        last_dashboard_action = self.ec2.get_state_date(
            "cloudwatch.dashboard.last_action", default=misc.epoch())
        dashboard_update_interval = Cfg.get_duration_secs(
            "cloudwatch.dashboard.update_interval")
        if (str(dashboard_state) == dashboard_last_state and
                (now - last_dashboard_action).total_seconds() < dashboard_update_interval):
            log.debug("Not yet the time to manage the dashboard.")
            return

        if Cfg.get_int("cloudwatch.dashboard.use_default") != 1:
            try:
                client.delete_dashboards(
                    DashboardNames=[self._get_dashboard_name()])
            except Exception:
                pass  # The dashboard may not exist; deletion failure is not fatal
        else:
            content = self.load_dashboard()
            log.log(
                log.NOTICE, "Configuring CloudWatch dashboard '%s'..." %
                self._get_dashboard_name())

            response = client.put_dashboard(
                DashboardName=self._get_dashboard_name(),
                DashboardBody=content)
        self.ec2.set_state("cloudwatch.dashboard.last_action",
                           now,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
Example #18
    def get_prerequisites(self):
        if "transfer" not in self.context["o_state"].get_resource_services():
            return

        self.resources = self.o_state.get_resources(service="transfer")

        self.servers = []
        transfer_client = self.context["transfer.client"]
        paginator = transfer_client.get_paginator('list_servers')
        tag_mappings = itertools.chain.from_iterable(
            page['Servers'] for page in paginator.paginate())
        self.servers = list(tag_mappings)

        #self.state_table = self.o_state.get_state_table()
        #self.state_table.register_aggregates([
        #    {
        #        "Prefix": "transferfamily.",
        #        "Compress": True,
        #        "DefaultTTL": Cfg.get_duration_secs("transferfamily.state.default_ttl"),
        #        "Exclude" : []
        #    }
        #    ])

        metric_time_resolution = Cfg.get_int(
            "transferfamily.metrics.time_resolution")
        if metric_time_resolution < 60:
            metric_time_resolution = 1  # Switch to highest resolution
        self.cloudwatch.register_metric([
            {
                "MetricName": "Subfleet.TransferFamily.Size",
                "Unit": "Count",
                "StorageResolution": metric_time_resolution
            },
            {
                "MetricName": "Subfleet.TransferFamily.RunningServers",
                "Unit": "Count",
                "StorageResolution": metric_time_resolution
            },
        ])

        # We need to register subfleet configuration keys dynamically to avoid a 'key unknown' warning
        #   when the user is going to set it
        subfleet_names = self.get_subfleet_names()
        for subfleet in subfleet_names:
            key = "subfleet.%s.state" % subfleet
            if not Cfg.is_builtin_key_exist(key):
                Cfg.register({key: ""})
        log.log(log.NOTICE,
                "Detected TransferFamily subfleets '%s'." % subfleet_names)
Example #19
    def get_dashboard_images(self):
        dashboard = json.loads(self.load_dashboard())
        # Get graph properties
        graph_metrics = list(
            filter(lambda g: g["type"] == "metric", dashboard["widgets"]))
        properties = [g["properties"] for g in graph_metrics]

        client = self.context["cloudwatch.client"]

        r = {}
        for p in properties:
            title = p["title"]
            p["width"] = Cfg.get_int("cloudwatch.dashboard.snapshot_width")
            p["height"] = Cfg.get_int("cloudwatch.dashboard.snapshot_height")
            try:
                response = client.get_metric_widget_image(
                    MetricWidget=json.dumps(p))
                r[title] = str(base64.b64encode(response["MetricWidgetImage"]),
                               "utf-8")
            except Exception as e:
                log.exception(
                    "Failed to retrieve CloudWatch graph image for '%s'! : %s"
                    % (title, e))
        return r
Example #20
def read_all_sqs_messages():
    messages = []
    sqs_client = ctx["sqs.client"]
    while True:
        response = sqs_client.receive_message(
            QueueUrl=ctx["MainSQSQueue"],
            AttributeNames=['All'],
            MaxNumberOfMessages=10,
            VisibilityTimeout=Cfg.get_int("app.run_period"),
            WaitTimeSeconds=0)
        if "Messages" in response:
            messages.extend(response["Messages"])
        else:
            break
    return messages
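Messages returned by receive_message remain in the queue until explicitly deleted; a hedged follow-up sketch (the processing step is elided, and the ctx keys mirror the function above):

def drain_and_delete():
    sqs_client = ctx["sqs.client"]
    for message in read_all_sqs_messages():
        # ... process message["Body"] here ...
        # Delete so the message is not redelivered once the
        # VisibilityTimeout used above expires.
        sqs_client.delete_message(QueueUrl=ctx["MainSQSQueue"],
                                  ReceiptHandle=message["ReceiptHandle"])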
Example #21
def route_random(message, peer):
    if len(peers.peers) < 2:
        return

    if message[b"ttl"] > config.get_int("maxttl"):
        return

    np = peer

    while np == peer:
        np = random.choice(peers.peers)

    message[b"ttl"] += 1

    np.send_packet(message)
Example #22
def tracer_task():
    while True:
        while len(peers.peers) < 1:
            time.sleep(5)

        peer = random.choice(peers.peers)
        peer.sent += 1

        msg = {
            "type": "tracer",
            "from": config.get("id"),
            "to": "SYSTEM",
            "ttl": 0,
            "ts": seconds_ns(),
        }
        peer.send_packet(msg)
        time.sleep(config.get_int("tracer_interval"))
Example #23
    def manage_rule_event(self, event):
        if Cfg.get_int("cron.disable"):
            return
        if "source" in event and event["source"] == "aws.events" and event[
                "detail-type"] == "Scheduled Event":
            # Triggered by an AWS CloudWatch Scheduled event. We look for a ParameterSet
            #   request based on the ARN
            misc.initialize_clients(["events"], self.context)
            misc.load_prerequisites(self.context, ["o_scheduler"])
            for r in event["resources"]:
                log.debug("Processing Scheduled event '%s'..." % r)
                m = re.search(
                    "^arn:aws:events:[a-z-0-9]+:[0-9]+:rule/CS-Cron-%s-(.*)" %
                    self.context["GroupName"], r)
                if m is not None and len(m.groups()) == 1:
                    rule_num = m.group(1)
                    log.info("Got event rule '%s'" % rule_num)
                    self.load_event_definitions()
                    rule_def = self.get_ruledef_by_name(
                        "CS-Cron-%s-%s" %
                        (self.context["GroupName"], rule_num))
                    log.debug(rule_def)

                    ttl = None
                    try:
                        ttl = misc.str2duration_seconds(
                            rule_def["TTL"]
                        ) if rule_def is not None and "TTL" in rule_def else None
                    except Exception as e:
                        log.exception(
                            "[WARNING] Failed to read 'TTL' value '%s'!" %
                            (rule_def["TTL"]))

                    params = dict(rule_def["Data"][0])
                    for k in params:
                        if k in ["TTL", "schedule"]: continue
                        Cfg.set(k, params[k], ttl=ttl)
            return True
        return False
Example #24
    def prepare_ssm(self):
        if not Cfg.get_int("ssm.enable"):
            return

        now       = self.context["now"]
        client    = self.context["ssm.client"]
        # Update instance inventory
        log.debug("describe_instance_information()")
        paginator = client.get_paginator('describe_instance_information')
        response_iterator = paginator.paginate(
            Filters=[
                {
                    'Key': 'tag:clonesquad:group-name',
                    'Values': [self.context["GroupName"]]
                },
            ],
            MaxResults=50)

        instance_infos = []
        for r in response_iterator:
            instance_infos.extend([d for d in r["InstanceInformationList"]])
        self.instance_infos = instance_infos
        log.debug("end - describe_instance_information()")
Example #25
def listener():
    addr = config.get("addr")
    port = config.get_int("port", 0)

    sock.bind((addr, port))

    while True:
        try:
            # Clean temporary peers
            peers.clean_temp()

            packet, peer_addr = sock.recvfrom(2048)
            message = bencode.decode(packet)

            if config.get_bool("dumpraw"):
                print(packet)

            key = config.get("network-key")

            if key:
                assert NetworkKey.check_signature(message)

            peer = peers.find_by_addr(peer_addr)

            if not peer:
                if config.get_bool("temp_peer"):
                    peer = peers.create_temp(peer_addr)
                else:
                    continue

            if config.get_bool("dump"):
                print(message)
            #print(message, peer_addr, peer.alias)
            handle(message, peer)
            peer.last_received = time.time()
        except Exception:
            # Ignore malformed packets and transient socket errors
            pass
Example #26
def interact_handler_entrypoint(event, context):
    """

    Parameters
    ----------
    event: dict, required

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------

    """

    global ctx
    ctx["now"]           = misc.utc_now()
    ctx["FunctionName"]  = "Interact"

    log.info("Processing start (version=%s)" % (ctx.get("CloneSquadVersion")))
    init()
    notify.do_not_notify = True # We do not want notification and event management in the Interact function

    #log.info(json.dumps(event))
    if ctx["LoggingS3Path"] != "" and Cfg.get_int("app.archive_interact_events"):
        s3path = "%s/InteractEvents/%s.json" % (ctx["LoggingS3Path"], ctx["now"])
        log.warning("Pushing Interact event in '%s'!" % s3path)
        misc.put_s3_object(s3path, Dbg.pprint(event))

    response = {}
    if ctx["o_interact"].handler(event, context, response):
        log.debug("API Gateway response: %s" % response)
    sqs.process_sqs_records(ctx, event)
    return response
Example #27
def publish_all_reports(ctx, url, reportname, now=None):
    global recurse_protection
    if recurse_protection: return
    recurse_protection = True

    try:
        log.info("Generating debug report in memory...")
        report = debug_report_generator(ctx)
        log.info("Publishing to S3 (clear text)...")
        debug_report_publish(ctx, report, url, reportname=reportname, now=now)
        log.info("Obfuscating...")
        if Cfg.get_int("notify.debug.obfuscate_s3_reports"):
            report = debug_report_obfuscate(ctx, report)
        log.info("Publishing to S3 (obfuscated)...")
        debug_report_publish(ctx,
                             report,
                             url,
                             reportname="%s_OBFUSCATED" % reportname,
                             now=now)
    except Exception as e:
        log.exception("[ERROR] Failed to send debug report to S3 (%s) : %s" %
                      (url, e))
    log.info("Uploaded debug report to S3...")
    recurse_protection = False
Example #28
    def get_prerequisites(self):
        if Cfg.get_int("cron.disable"):
            return

        # Get Timezone related info
        self.timezones = yaml.safe_load(
            misc.get_url("internal:region-timezones.yaml"))
        self.tz = os.getenv("TimeZone")
        self.tz = self.timezones.get(self.context["AWS_DEFAULT_REGION"]) if (
            self.tz is None or self.tz == "") else self.tz
        self.tz = self.tz if self.tz else "UTC"
        self.local_now = arrow.now(
            self.tz)  # Get local time (with local timezone)
        self.utc_offset = self.local_now.utcoffset()
        self.dst_offset = self.local_now.dst()
        log.log(
            log.NOTICE,
            "Current timezone offset to UTC: %s, DST: %s, TimeZone: %s" %
            (self.utc_offset, self.dst_offset, self.tz))

        # Load scheduler KV table
        self.scheduler_table = kvtable.KVTable.create(
            self.context,
            self.context["SchedulerTable"],
            cache_max_age=Cfg.get_duration_secs("scheduler.cache.max_age"))

        # Compute event names
        self.load_event_definitions()

        # Read all existing event rules
        client = self.context["events.client"]
        params = {
            "NamePrefix": "CS-Cron-%s-" % (self.context["GroupName"]),
            "Limit": 10
        }
        self.rules = []
        paginator = client.get_paginator('list_rules')
        response_iterator = paginator.paginate(**params)
        for response in response_iterator:
            if "Rules" in response:
                self.rules.extend(response["Rules"])

        max_rules_per_batch = Cfg.get_int("cron.max_rules_per_batch")
        # Create missing rules
        expected_rule_names = [r["Name"] for r in self.event_names]
        existing_rule_names = [r["Name"] for r in self.rules]
        for r in expected_rule_names:
            if r not in existing_rule_names:
                max_rules_per_batch -= 1
                if max_rules_per_batch <= 0:
                    break
                rule_def = self.get_ruledef_by_name(r)
                schedule_spec = rule_def["Data"][0]["schedule"]
                schedule_expression = self.process_cron_expression(
                    schedule_spec)
                log.log(
                    log.NOTICE,
                    f"Creating {r} {schedule_spec} => {schedule_expression}..."
                )

                # In order to remove burden on the user, we perform a sanity check against a well-known
                #    limitation of CloudWatch.
                if schedule_expression.startswith("cron("):
                    expr = [
                        i
                        for i in schedule_expression.replace("(", " ").replace(
                            ")", " ").split(" ") if i != ""
                    ]
                    if len(expr) != 7:
                        log.warn(
                            "Schedule rule '%s' has an invalid cron expression '%s' (too short cron syntax)! Ignore it..."
                            % (rule_def["EventName"], schedule_expression))
                        continue
                    if (expr[5] != '?' and not expr[3] == '?') or (
                            expr[3] != '?' and not expr[5] == '?'):
                        log.warn(
                            "Schedule rule '%s' has an invalid cron expression '%s'. "
                            "You can't specify the Day-of-month and Day-of-week fields in the same cron expression. If you specify a value (or a *) in one of the fields, you must use a ? (question mark) in the other. "
                            "" % (rule_def["EventName"], schedule_expression))
                        continue

                # Update Cloudwatch rule
                try:
                    response = client.put_rule(
                        Name=r,
                        Description="Schedule Event '%s': %s" %
                        (rule_def["EventName"], rule_def["Event"]),
                        RoleArn=self.context["CloudWatchEventRoleArn"],
                        ScheduleExpression=schedule_expression,
                        State='ENABLED')
                    log.debug("put_rule: %s" % response)
                except Exception as e:
                    log.exception(
                        "Failed to create scheduler event '%s' (%s) : %s" %
                        (r, schedule_expression, e))

                try:
                    response = client.put_targets(
                        Rule=r,
                        Targets=[{
                            'Arn': self.context["InteractLambdaArn"],
                            'Id': "id%s" % r,
                        }])
                    log.debug("put_targets: %s" % response)
                except Exception as e:
                    log.exception(
                        "Failed to set targets for event rule '%s' : %s" %
                        (r, e))

        # Garbage collect obsolete rules
        for r in existing_rule_names:
            if r not in expected_rule_names:
                max_rules_per_batch -= 1
                if max_rules_per_batch <= 0:
                    break
                try:
                    client.remove_targets(Rule=r, Ids=["id%s" % r])
                    client.delete_rule(Name=r)
                except Exception as e:
                    log.exception("Failed to delete rule '%s' : %s" % (r, e))
Example #29
def AnalyzePredsSimple(lSamples):
    if config.get_bool('FORCE_SINGLE_DIR'):
        dSamples = {};
        for sample in lSamples:
            tKey = (sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo);
            assert(tKey not in dSamples);
            dSamples[tKey] = sample;

    iNumTotal = 0;
    iNumCorrect = 0;
    iTruePos = 0;
    iFalsePos = 0;
    iTrueNeg = 0;
    iFalseNeg = 0;
    iThres = 0;
    if config.get_bool('SVM'):
        fThres = config.get_int('SVM_THRESHOLD');
    elif config.get_bool('LOG_LINEAR'):
        fThres = 0.5
    else:
        assert False;

    if config.get_bool('CALC_FSCORE_ON_GOLD'):
        setGoldStringConns = LoadGoldStringConnSet()
        iNumGold = len(setGoldStringConns);

    if config.get_bool('ANALYZE_ON_HARD'):
        lEasy = data.file_to_obj(config.get_string('EASY_CONNECTIONS_LIST_FILE'));
    fPredMin = sys.float_info.max;
    fPredMax = -sys.float_info.max;
    for sample in lSamples:
        if config.get_bool('ANALYZE_ON_HARD'):
            if sample.pddlconn.sPddlTo in lEasy:
                continue;

        if config.get_bool('TRAIN_ON_REWARD_EVAL_ON_GOLD'):
            bActual = sample.GetGoldPos(bIgnoreDir = config.get_bool('IGNORE_DIR_FOR_EVAL'));
        else:
            bActual = sample.GetPos(bIgnoreDir = config.get_bool('IGNORE_DIR_FOR_EVAL'));
        if config.get_bool('FORCE_SINGLE_DIR'):
            fPred = sample.fPred;
            tReverseKey = (sample.pddlconn.sPddlTo, sample.pddlconn.sPddlFrom);
            fReversePred = dSamples[tReverseKey].fPred if tReverseKey in dSamples else -sys.maxint;
            bNormalPred = (float(sample.fPred) > fThres);
            bPred = ((float(sample.fPred) > fThres) and (float(fPred) >= float(fReversePred)));
            if tReverseKey not in dSamples:
                print "FORCE-MISSING";
            elif (bNormalPred == bActual) and (bPred != bActual):
                print "FORCE-BAD:", sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo, fPred, fReversePred;
            elif  (bNormalPred != bActual) and (bPred == bActual):
                print "FORCE-GOOD:", sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo, fPred, fReversePred;
            else:
                print "FORCE-NEITHER:", sample.pddlconn.sPddlFrom, sample.pddlconn.sPddlTo, fPred, fReversePred;
        else:
            bPred = sample.GetPredPos(bIgnoreDir = config.get_bool('IGNORE_DIR_FOR_EVAL'));
        fPredMin = min(fPredMin, sample.fPred);
        fPredMax = max(fPredMax, sample.fPred);

        iNumTotal += 1;
        if bPred == bActual:
            iNumCorrect += 1;
        if bPred:
            if bActual:
                iTruePos += 1;
            else:
                iFalsePos += 1;
        else:
            if bActual:
                iFalseNeg += 1;
            else:
                iTrueNeg += 1;

    if config.get_bool('CALC_FSCORE_ON_GOLD'):
        iFalseNeg = iNumGold - iTruePos;
        if config.get_bool('ANALYZE_ON_HARD'):
            iFalseNeg = iNumGold - iTruePos - len(lEasy);

    fPrecision = float(iTruePos)/float(iTruePos+iFalsePos) if iTruePos > 0 else 0;
    fRecall = float(iTruePos)/float(iTruePos+iFalseNeg) if iTruePos > 0 else 0;
    fScore = 2*fPrecision*fRecall/(fPrecision+fRecall) if (fPrecision*fRecall) > 0 else 0;
    print "FPred: min:", fPredMin, "max:", fPredMax;
    print "FScore:", fScore, fPrecision, fRecall;
    print "Frac Correct:", float(iNumCorrect)/float(iNumTotal), iNumCorrect, iNumTotal;
    print "TP:", iTruePos, "FP:", iFalsePos, "TN:", iTrueNeg, "FN:", iFalseNeg;
    print "FracPos:", float(iTruePos+iFalsePos)/float(iTrueNeg+iFalseNeg+iTruePos+iFalsePos);
    return fScore, fPrecision, fRecall;
Example #30
    def configure_alarms(self):
        """ Configure Cloudwatch Alarms for each instance.

            The algorithm needs to manage missing alarm as well updating existing alarms
        """
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        valid_alarms = []
        nb_of_updated_alarms = 0
        max_update_per_batch = Cfg.get_int(
            "cloudwatch.metrics.max_update_per_batch")

        log.log(
            log.NOTICE,
            "Found following Alarm definition key(s) in configuration: %s" %
            [d for d in self.alarm_definitions])

        # Step 1) Create or Update CloudWatch Alarms for running instances
        for instance in self.ec2.get_instances(
                State="pending,running",
                ScalingState="-error,draining,excluded"):
            instance_id = instance["InstanceId"]

            age_secs = (now - instance["LaunchTime"]).total_seconds()
            min_instance_age = Cfg.get_duration_secs(
                "cloudwatch.alarms.min_instance_age")
            if age_secs < min_instance_age:
                log.log(
                    log.NOTICE,
                    "Instance '%s' too young. Wait %d seconds before to set an alarm..."
                    % (instance_id, min_instance_age - age_secs))
                continue

            #Update alarms for this instance
            for alarm_definition in self.alarm_definitions:
                # First, check if an alarm already exists
                alarm_name = self._get_alarm_name(self.context["GroupName"],
                                                  instance["InstanceId"],
                                                  int(alarm_definition))
                existing_alarms = list(
                    filter(lambda x: x['AlarmName'] == alarm_name,
                           self.alarms))

                # Load alarm template
                try:
                    if "Content" not in self.alarm_definitions[
                            alarm_definition]:
                        continue
                    kwargs = self.context.copy()
                    kwargs["InstanceId"] = instance_id
                    alarm_template = self.alarm_definitions[alarm_definition][
                        "Content"].format(**kwargs)
                    alarm = yaml.safe_load(alarm_template)
                except Exception as e:
                    log.exception(
                        "[ERROR] Failed to read YAML alarm file '%s' : %s" %
                        (alarm_template, e))
                    continue
                alarm["AlarmName"] = alarm_name

                valid_alarms.append(alarm_name)

                # Check if an alarm already exists
                existing_alarm = None
                if len(existing_alarms) > 0:
                    existing_alarm = existing_alarms[0]

                    # Check if alarm definition will be the same
                    a = {**existing_alarm, **alarm}
                    # 2020/07/20: CloudWatch Alarm API does not return Tags. Have to deal with
                    #  while comparing the configurations.
                    if "Tags" in a and "Tags" not in existing_alarm:
                        del a["Tags"]
                    if a == existing_alarm:
                        #log.debug("Not updating alarm '%s' as configuration is already ok" % alarm_name)
                        continue

                    # Check if we updated this alarm very recently
                    delta = datetime.now(
                        timezone.utc
                    ) - existing_alarm["AlarmConfigurationUpdatedTimestamp"]
                    if delta < timedelta(minutes=1):
                        log.debug("Alarm '%s' updated to soon" % alarm_name)
                        continue

                nb_of_updated_alarms += 1
                if nb_of_updated_alarms > max_update_per_batch: break

                log.log(
                    log.NOTICE,
                    "Updating/creating CloudWatch Alarm '%s' : %s" %
                    (alarm_name, alarm))
                resp = client.put_metric_alarm(**alarm)
                log.debug(Dbg.pprint(resp))

        # Step 2) Destroy CloudWatch Alarms for non existing instances (Garbage Collection)
        for existing_alarm in self.alarms:
            alarm_name = existing_alarm["AlarmName"]
            if not alarm_name.startswith("CloneSquad-%s-i-" %
                                         (self.context["GroupName"])):
                continue
            if alarm_name not in valid_alarms:
                nb_of_updated_alarms += 1
                if nb_of_updated_alarms > max_update_per_batch: break
                log.debug("Garbage collection orphan Cloudwatch Alarm '%s'" %
                          alarm_name)
                resp = client.delete_alarms(AlarmNames=[alarm_name])
                log.debug(resp)
                nb_of_updated_alarms += 1
                if nb_of_updated_alarms > max_update_per_batch: break
Example #31
    def __init__(self, context, ec2):
        self.context = context
        self.ec2 = ec2
        self.alarms = None
        self.metrics = []
        Cfg.register({
            "cloudwatch.describe_alarms.max_results": "50",
            "cloudwatch.default_ttl": "days=1",
            "cloudwatch.alarms.max_per_instance": "6",
            "cloudwatch.alarms.min_instance_age": "minutes=3",
            "cloudwatch.configure.max_alarms_deleted_batch_size": "5",
            "cloudwatch.metrics.namespace": "CloneSquad",
            "cloudwatch.metrics.subnamespace": "",
            "cloudwatch.metrics.excluded,Stable": {
                "DefaultValue":
                "",
                "Format":
                "StringList",
                "Description":
                """List of metric pattern names to not send to Cloudwatch

This configuration key is used for cost optimization by filtering which CloneSquad metrics are sent to CloudWatch.
It supports regex patterns.

> Ex: StaticFleet.*;NbOfBouncedInstances

                        """
            },
            "cloudwatch.metrics.data_period": "minutes=2",
            "cloudwatch.metrics.max_update_per_batch": "20",
            "cloudwatch.metrics.cache.max_retention_period": "minutes=10",
            "cloudwatch.metrics.minimum_polled_alarms_per_run": "1",
            "cloudwatch.metrics.time_for_full_metric_refresh,Stable": {
                "DefaultValue":
                "minutes=1,seconds=30",
                "Format":
                "Duration",
                "Description":
                """The total period for a complete refresh of EC2 Instance metrics

This parameter is a way to reduce the CloudWatch cost induced by GetMetricData API calls. It indirectly defines how many alarm metrics
will be polled in a single Main Lambda execution. A dedicated algorithm is used to extrapolate missing data based
on previous GetMetricData API calls.

Reducing this value increases the accuracy of the scaling criteria and so the reactivity of CloneSquad to a sudden burst of activity load, but at the
expense of CloudWatch.GetMetricData API cost.

This parameter does not influence the polling of user supplied alarms that are always polled at each run.
                        """
            },
            "cloudwatch.dashboard.use_default,Stable": {
                "DefaultValue":
                1,
                "Format":
                "Bool",
                "Description":
                """Enable or disable the Cloudwatch dashboard for CloneSquad.

The dashboard is enabled by default.
                        """
            },
            "cloudwatch.dashboard.update_interval": "hours=1",
            "cloudwatch.dashboard.snapshot_width": 1000,
            "cloudwatch.dashboard.snapshot_height": 400
        })
        Cfg.register({
            "cloudwatch.alarm00.configuration_url,Stable": {
                "DefaultValue":
                "",
                "Format":
                "MetaString",
                "Description":
                """Alarm specification to track for scaling decisions.

    Ex: internal:ec2.scaleup.alarm-cpu-gt-75pc.yaml,Points=1001,BaselineThreshold=30.0

See [Alarm specification documentation](ALARMS_REFERENCE.md)  for more details.
            """
            }
        })
        for i in range(1, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            Cfg.register({
                "cloudwatch.alarm%02d.configuration_url,Stable" % i: {
                    "DefaultValue":
                    "",
                    "Format":
                    "MetaString",
                    "Description":
                    """See `cloudwatch.alarm00.configuration_url`.
                """
                }
            })
        self.register_metric([{
            "MetricName": "Cloudwatch.GetMetricData",
            "Unit": "Count",
            "StorageResolution": 60
        }])

        self.ec2.register_state_aggregates([{
            "Prefix":
            "cloudwatch.dashboard.",
            "Compress":
            True,
            "DefaultTTL":
            Cfg.get_duration_secs("cloudwatch.default_ttl"),
            "Exclude": []
        }])
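Several keys registered above take duration strings such as "days=1" or "minutes=1,seconds=30", read back via Cfg.get_duration_secs elsewhere in these examples. A minimal hedged sketch of parsing that format, not CloneSquad's actual implementation:

from datetime import timedelta

def duration_secs(spec):
    # Parse "minutes=1,seconds=30"-style strings into seconds.
    parts = dict(kv.split("=") for kv in spec.split(","))
    return int(timedelta(**{k: int(v) for k, v in parts.items()}).total_seconds())

assert duration_secs("minutes=1,seconds=30") == 90
assert duration_secs("days=1") == 86400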
Example #32
    def get_prerequisites(self):
        now = self.context["now"]
        client = self.context["cloudwatch.client"]

        # Read all CloudWatch alarm templates into memory
        alarm_definitions = {}
        for i in range(0, Cfg.get_int("cloudwatch.alarms.max_per_instance")):
            key = "cloudwatch.alarm%02d.configuration_url" % (i)
            r = Cfg.get_extended(key)
            if not r["Success"] or r["Value"] == "":
                continue

            d = misc.parse_line_as_list_of_dict(r["Value"])
            url = d[0]["_"]
            meta = d[0]

            index = "%02d" % i
            alarm_defs = {
                "Index": index,
                "Key": key,
                "Url": url,
                "Definition": r,
                "Metadata": meta
            }

            prefix = "alarmname:"
            if url.startswith(prefix):
                alarm_defs["AlarmName"] = url[len(prefix):]
            else:
                log.log(log.NOTICE, "Read Alarm definition: %s" % r["Value"])
                try:
                    resp = misc.get_url(url.format(**self.context))
                    if resp is None:
                        raise Exception("URL content = <None>")
                    alarm_defs["Content"] = str(resp, "utf-8")
                except Exception as e:
                    log.exception("Failed to load Alarm definition '%s' : %e" %
                                  (r["Value"], e))
                    continue
            alarm_definitions[index] = alarm_defs

        self.alarm_definitions = alarm_definitions

        # Read all existing CloudWatch alarms
        alarms = []
        response = None
        while (response is None or "NextToken" in response):
            response = client.describe_alarms(MaxRecords=Cfg.get_int(
                "cloudwatch.describe_alarms.max_results"),
                                              NextToken=response["NextToken"]
                                              if response is not None else "")
            #log.debug(Dbg.pprint(response))
            for alarm in response["MetricAlarms"]:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is not None:
                    # This is an alarm that belongs to this CloneSquad instance
                    alarms.append(alarm)
        #log.debug(Dbg.pprint(alarms))
        self.alarms = alarms

        # Sanity check
        for index in self.alarm_definitions.keys():
            alarm_def = self.alarm_definitions[index]
            if "AlarmName" not in alarm_def:
                continue
            alarm = next(
                filter(lambda a: a["AlarmName"] == alarm_def["AlarmName"],
                       self.alarms), None)
            if alarm is None:
                log.warning(
                    "Alarm definition [%s](%s => %s) doesn't match an existing CloudWatch alarm!"
                    % (alarm_def["Definition"]["Key"],
                       alarm_def["Definition"]["Value"],
                       alarm_def["Definition"]["Status"]))

        # Read all metrics associated with alarms

        # CloudWatch intense polling can be expensive: This algorithm links CW metric polling rate to the
        #    scale rate => Under intense scale-up conditions, polling is aggressive. If not, it falls back
        #    to one poll every 'cloudwatch.metrics.low_rate_polling_interval' seconds
        # TODO(@jcjorel): Avoid this kind of direct reference to an upper-level module!!
        integration_period = Cfg.get_duration_secs(
            "ec2.schedule.horizontalscale.integration_period")
        instance_scale_score = self.ec2.get_integrated_float_state(
            "ec2.schedule.scaleout.instance_scale_score", integration_period)

        self.metric_cache = self.get_metric_cache()

        query = {"IdMapping": {}, "Queries": []}

        # Build query for Alarm metrics
        if Cfg.get("ec2.schedule.desired_instance_count") == "-1":
            # Sort by oldest alarms first in cache
            cached_metric_names = [m["_MetricId"] for m in self.metric_cache]
            valid_alarms = []
            for a in alarms:
                alarm_name = a["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None or alarm_def["AlarmDefinition"][
                        "Url"].startswith("alarmname:"):
                    continue
                a["_SamplingTime"] = self.get_metric_by_id(
                    alarm_name
                )["_SamplingTime"] if alarm_name in cached_metric_names else str(
                    misc.epoch())
                valid_alarms.append(a)
            sorted_alarms = sorted(
                valid_alarms, key=lambda a: misc.str2utc(a["_SamplingTime"]))

            # We poll from the oldest to the newest, depending on the instance_scale_score, to limit CloudWatch GetMetricData costs
            time_for_full_metric_refresh = max(
                Cfg.get_duration_secs(
                    "cloudwatch.metrics.time_for_full_metric_refresh"), 1)
            app_run_period = Cfg.get_duration_secs("app.run_period")
            minimum_polled_alarms_per_run = Cfg.get_int(
                "cloudwatch.metrics.minimum_polled_alarms_per_run")
            maximum_polled_alarms_per_run = app_run_period / time_for_full_metric_refresh
            maximum_polled_alarms_per_run = min(maximum_polled_alarms_per_run,
                                                1.0)
            weight = min(instance_scale_score, maximum_polled_alarms_per_run)
            max_alarms_for_this_run = max(
                minimum_polled_alarms_per_run,
                int(min(weight, 1.0) * len(sorted_alarms)))
            for alarm in sorted_alarms[:max_alarms_for_this_run]:
                alarm_name = alarm["AlarmName"]
                CloudWatch._format_query(query, alarm_name, alarm)

            # We always poll user supplied alarms
            for alarm in alarms:
                alarm_name = alarm["AlarmName"]
                alarm_def = self.get_alarm_configuration_by_name(alarm_name)
                if alarm_def is None:
                    continue  # Unknown alarm name
                if not alarm_def["AlarmDefinition"]["Url"].startswith(
                        "alarmname:"):
                    continue
                CloudWatch._format_query(query, alarm_name, alarm)

        # Query Metric for Burstable instances
        burstable_instances = self.ec2.get_burstable_instances(
            ScalingState="-error")
        last_collect_date = self.ec2.get_state_date(
            "cloudwatch.metrics.last_burstable_metric_collect_date")
        if last_collect_date is None or (now - last_collect_date) > timedelta(
                minutes=1):
            for i in burstable_instances:
                instance_id = i["InstanceId"]
                if not self.ec2.is_static_subfleet_instance(
                        instance_id) and self.ec2.get_scaling_state(
                            instance_id) == "excluded":
                    continue
                CloudWatch._format_query(
                    query, "%s/%s" % ("CPUCreditBalance", instance_id), {
                        "MetricName":
                        "CPUCreditBalance",
                        "Namespace":
                        "AWS/EC2",
                        "Dimensions": [{
                            "Name": "InstanceId",
                            "Value": instance_id
                        }],
                        "Period":
                        300,
                        "Statistic":
                        "Average"
                    })
            self.ec2.set_state(
                "cloudwatch.metrics.last_burstable_metric_collect_date",
                now,
                TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))

        # Make request to CloudWatch
        query_counter = self.ec2.get_state_int(
            "cloudwatch.metric.query_counter", default=0)
        queries = query["Queries"]
        metric_results = []
        metric_ids = []
        no_metric_ids = []
        while len(queries) > 0:
            q = queries[:500]
            queries = queries[500:]
            results = []
            response = None
            while response is None or "NextToken" in response:
                args = {
                    "MetricDataQueries":
                    q,
                    "StartTime":
                    now - timedelta(seconds=Cfg.get_duration_secs(
                        "cloudwatch.metrics.data_period")),
                    "EndTime":
                    now
                }
                if response is not None:
                    args["NextToken"] = response["NextToken"]
                response = client.get_metric_data(**args)
                results.extend(response["MetricDataResults"])
                query_counter += len(q)

            for r in results:
                if r["StatusCode"] != "Complete":
                    log.error("Failed to retrieve metrics: %s" % q)
                    continue
                metric_id = query["IdMapping"][r["Id"]]
                if len(r["Timestamps"]) == 0:
                    if metric_id not in no_metric_ids:
                        no_metric_ids.append(metric_id)
                    continue
                if metric_id not in metric_ids:
                    metric_ids.append(metric_id)
                r["_MetricId"] = metric_id
                r["_SamplingTime"] = str(now)
                log.debug(r)
                metric_results.append(r)
        if len(no_metric_ids):
            log.info("No metrics returned for alarms '%s'" % no_metric_ids)

        # Merge with existing cache metric
        metric_cache = self.metric_cache
        self.metric_cache = metric_results
        for m in metric_cache:
            max_retention_period = Cfg.get_duration_secs(
                "cloudwatch.metrics.cache.max_retention_period")
            if m["_MetricId"] in metric_ids or "_SamplingTime" not in m:
                continue
            if (now - misc.str2utc(m["_SamplingTime"])
                ).total_seconds() < max_retention_period:
                self.metric_cache.append(m)

        self.ec2.set_state("cloudwatch.metric.query_counter",
                           query_counter,
                           TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.ec2.set_state_json(
            "cloudwatch.metrics.cache",
            self.metric_cache,
            TTL=Cfg.get_duration_secs("cloudwatch.default_ttl"))
        self.set_metric("Cloudwatch.GetMetricData", query_counter)

        # Augment Alarm definitions and Instances with associated metrics
        for metric in self.metric_cache:
            metric_id = metric["_MetricId"]

            alarm_data = self.get_alarm_data_by_name(metric_id)
            if alarm_data is not None:
                alarm_data["MetricDetails"] = metric
                continue

            instance = next(
                filter(lambda i: "CPUCreditBalance/%s" % i["InstanceId"] == metric_id,
                       burstable_instances), None)
            if instance is not None:
                instance["_Metrics"] = {}
                instance["_Metrics"]["CPUCreditBalance"] = metric
                continue

    def send_commands(self):
        if not Cfg.get_int("ssm.enable"):
            return

        client = self.context["ssm.client"]
        refs   = {
            "Linux": {
                "document": "AWS-RunShellScript",
                "shell": [s.rstrip() for s in io.StringIO(str(misc.get_url("internal:cs-ssm-agent.sh"), "utf-8")).readlines()],
                "ids": [],
            }
        }
        # Purge already replied results
        valid_cmds = []
        for cmd in self.run_cmd_states["Commands"]:
            if cmd.get("Complete") or cmd["Expiration"] < misc.seconds_from_epoch_utc():
                continue
            valid_cmds.append(cmd)
        self.run_cmd_states["Commands"] = valid_cmds
        # Purge outdated former results
        former_results  = self.run_cmd_states["FormerResults"]
        for i in list(former_results.keys()):
            for cmd in list(former_results[i].keys()):
                if former_results[i][cmd]["Expiration"] < misc.seconds_from_epoch_utc():
                    del former_results[i][cmd]
            if len(former_results[i].keys()) == 0:
                del former_results[i]

        # Send commands
        for cmd in self.commands_to_send:
            platforms = {}
            for i in cmd["InstanceIds"]:
                info = self.is_instance_online(i)
                if info is None:
                    continue
                platform_type = info["PlatformType"]
                pltf          = refs.get(platform_type)
                if pltf is None:
                    log.warning("Can't run a command on an unsupported platform : %s" % info["PlatformType"])
                    continue # Unsupported platform
                if platform_type not in platforms:
                    platforms[platform_type] = copy.deepcopy(pltf)
                if i not in platforms[platform_type]["ids"]:
                    platforms[platform_type]["ids"].append(i)

            command = cmd["Command"]
            args    = cmd["CommandArgs"]
            for p in platforms:
                pltf         = platforms[p]
                instance_ids = pltf["ids"]
                if not len(instance_ids):
                    continue
                document     = pltf["document"]
                shell        = pltf["shell"]
                i_ids        = instance_ids
                # Perform string parameter substitutions in the helper script
                shell_input = [l.replace("##Cmd##", command) for l in shell]
                shell_input = [l.replace("##ApiGwUrl##", self.context["InteractAPIGWUrl"]) for l in shell_input]
                if isinstance(args, str):
                    shell_input = [l.replace("##Args##", args) for l in shell_input]
                else:
                    shell_input = [l.replace("##Args##", args["Args"] if "Args" in args else "") for l in shell_input]
                    for s in args:
                        shell_input = [l.replace(f"##{s}##", str(args[s])) for l in shell_input]

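                # SSM SendCommand targets at most 50 explicit instance ids per call, so send in batches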
                while len(i_ids):
                    log.log(log.NOTICE, f"SSM SendCommand({p}): {command}({args}) to %s." % i_ids[:50])

                    try:
                        response = client.send_command(
                            InstanceIds=i_ids[:50],
                            DocumentName=document,
                            TimeoutSeconds=cmd["Timeout"],
                            Comment=cmd["Comment"],
                            Parameters={
                                'commands': shell_input,
                                'executionTimeout': [str(cmd["Timeout"])]
                            },
                            MaxConcurrency='100%',
                            MaxErrors='100%',
                            CloudWatchOutputConfig={
                                'CloudWatchLogGroupName': self.context["SSMLogGroup"],
                                'CloudWatchOutputEnabled': True
                            }
                        )
                        self.run_cmd_states["Commands"].append({
                            "Id": response["Command"]["CommandId"],
                            "InstanceIds": i_ids[:50],
                            "ReceivedInstanceIds": [],
                            "Command": command,
                            "CommandArgs": args,
                            "Results": {},
                            "Expiration": misc.seconds_from_epoch_utc() + Cfg.get_duration_secs("ssm.state.command.default_ttl")
                        })
                        log.log(log.NOTICE, f"SSM RunCommand (Id:%s) : {command}({args})" % response["Command"]["CommandId"])
                    except Exception as e:
                        # Under rare circumstances, we can receive an exception while trying to send
                        log.log(log.NOTICE, f"Failed to do SSM SendCommand: {e}, {i_ids[:50]}")
                    i_ids = i_ids[50:]
        self.o_state.set_state_json("ssm.events.run_commands", self.run_cmd_states, compress=True, TTL=self.ttl)
    def manage_maintenance_windows(self):
        """ Read SSM Maintenance Window information and apply temporary configuration during maintenance period.
        """
        config_tag = "clonesquad:config:"
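        # _set_tag() turns 'clonesquad:config:*' tags set on a Maintenance Window into configuration
        #   overrides and returns any min_instance_count override separately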
        def _set_tag(fleet, config, mw):
            min_instance_count = None
            if "Tags" in mw:
                tags = {}
                for t in mw["Tags"]:
                    if t["Key"].startswith(config_tag):
                        tags[t["Key"][len(config_tag):]] = t["Value"]
                if fleet is None:
                    if "ec2.schedule.min_instance_count" in tags: 
                        min_instance_count = tags["ec2.schedule.min_instance_count"]
                else:
                    tag = f"subfleet.{fleet}.ec2.schedule.min_instance_count"
                    if tag in tags:
                        min_instance_count = tags[tag]
                        del tags[tag]
                    tag = f"subfleet.__all__.ec2.schedule.min_instance_count"
                    if tag in tags:
                        min_instance_count = tags[tag]
                        del tags[tag]
                for t in tags:    
                    if not Cfg.is_builtin_key_exist(t):
                        log.warning(f"On SSM MaintenanceWindow objection %s/%s, tag '{config_tag}.{t}' does not refer "
                            "to an existing configuration key!!" % (mw["WindowId"], mw["Name"]))
                        continue
                    config[f"override:{t}"] = tags[t]
            return min_instance_count
        config = {}
        meta   = {}
        is_maintenance_time  = self.is_maintenance_time(meta=meta)
        self._record_last_maintenance_window_time(is_maintenance_time)

        # Send events with SSM and notify users
        instances         = self.o_ec2.get_instances(State="pending,running", main_fleet_only=True)
        instance_ids      = [i["InstanceId"] for i in instances]
        event_name        = "ENTER_MAINTENANCE_WINDOW_PERIOD" if is_maintenance_time else "EXIT_MAINTENANCE_WINDOW_PERIOD"
        pretty_event_name = "EnterMaintenanceWindowPeriod" if is_maintenance_time else "ExitMaintenanceWindowPeriod"
        self.send_events(instance_ids, "maintenance_window.state_change", event_name, {
            }, notification_handler=self.ssm_maintenance_window_event, pretty_event_name=pretty_event_name)

        # Main fleet Maintenance window management
        if not is_maintenance_time:
            if "NextWindowMessage" in meta:
                log.log(log.NOTICE, meta["NextWindowMessage"])
        else:
            log.log(log.NOTICE, f"Main fleet under Active Maintenance Window until %s : %s" % 
                    (meta["EndTime"], meta["MatchingWindow"]))
            min_instance_count = _set_tag(None, config, meta["MatchingWindow"])
            if min_instance_count is None:
                min_instance_count = Cfg.get("ssm.feature.maintenance_window.mainfleet.ec2.schedule.min_instance_count")
            config["override:ec2.schedule.min_instance_count"] = min_instance_count
            if min_instance_count == "100%":
                config["override:ec2.schedule.desired_instance_count"] = "100%"

        # Subfleet Maintenance window management
        for subfleet in self.o_ec2.get_subfleet_names():
            meta = {}
            is_maintenance_time = self.is_maintenance_time(fleet=subfleet, meta=meta)
            self._record_last_maintenance_window_time(is_maintenance_time, fleet=subfleet)
            # Send events with SSM and notify users
            instances           = self.o_ec2.get_instances(State="running", instances=self.o_ec2.get_subfleet_instances(subfleet_name=subfleet))
            instance_ids        = [i["InstanceId"] for i in instances]
            event_name          = "ENTER_MAINTENANCE_WINDOW_PERIOD" if is_maintenance_time else "EXIT_MAINTENANCE_WINDOW_PERIOD"
            pretty_event_name   = "EnterMaintenanceWindowPeriod" if is_maintenance_time else "ExitMaintenanceWindowPeriod"
            self.send_events(instance_ids, "maintenance_window.state_change", event_name, {
                }, notification_handler=self.ssm_maintenance_window_event, pretty_event_name=pretty_event_name)

            if not is_maintenance_time:
                if "NextWindowMessage" in meta:
                    log.log(log.NOTICE, meta["NextWindowMessage"])
            else:
                log.log(log.NOTICE, f"Subflee '{subfleet}' fleet under Active Maintenance Window until %s : %s" % 
                    (meta["EndTime"], meta["MatchingWindow"]))
                min_instance_count = _set_tag(subfleet, config, meta["MatchingWindow"])
                if min_instance_count is None:
                    min_instance_count = Cfg.get(f"ssm.feature.maintenance_window.subfleet.{subfleet}.ec2.schedule.min_instance_count")
                config[f"override:subfleet.{subfleet}.ec2.schedule.min_instance_count"] = min_instance_count
                if min_instance_count == "100%":
                    config[f"override:subfleet.{subfleet}.ec2.schedule.desired_instance_count"] = "100%"
                if Cfg.get_int("ssm.feature.maintenance_window.subfleet.{SubfleetName}.force_running"):
                    config[f"override:subfleet.{subfleet}.state"] = "running"

        # Register SSM Maintenance Window configuration override
        Cfg.register(config, layer="SSM Maintenance window override", create_layer_when_needed=True)
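A note on the metric collection loop earlier in this example: CloudWatch GetMetricData accepts at most 500 queries per call and paginates long results with NextToken, which is why the code slices the query list and loops on the token. Below is a minimal standalone sketch of that pattern, assuming boto3 credentials and a caller-provided list of MetricDataQuery dicts (the function name and arguments are illustrative, not part of the code above):

import datetime

import boto3

def fetch_metric_data(queries, period_secs=3600):
    # 'queries' is a list of CloudWatch MetricDataQuery dicts (hypothetical input)
    client = boto3.client("cloudwatch")
    now = datetime.datetime.utcnow()
    results = []
    while queries:
        chunk, queries = queries[:500], queries[500:]  # 500 = per-call API limit
        args = {
            "MetricDataQueries": chunk,
            "StartTime": now - datetime.timedelta(seconds=period_secs),
            "EndTime": now,
        }
        response = None
        while response is None or "NextToken" in response:
            if response is not None:
                args["NextToken"] = response["NextToken"]
            response = client.get_metric_data(**args)
            results.extend(response["MetricDataResults"])
    return results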
Ejemplo n.º 35
0
def main():
    """View a simulation of contagion"""
    args = cli()
    config.configure(args.conf)
    nrows = config.get_int("Grid", "rows")
    ncols = config.get_int("Grid", "cols")

    population = model.Population(nrows, ncols)

    # View of the main model
    view = grid_view.GridView(config.get_int("Grid", "Width"),
                              config.get_int("Grid", "Height"),
                              nrows=nrows,
                              ncols=ncols,
                              title="Contagion",
                              autoflush=False)

    # Summary statistics
    stats_view = contagion_stats.Stats(population)

    # Monitor changes to cells ---
    #    - for monitoring progress
    #    - for updating the main view
    monitor = change_listener.ChangeListener()
    # Attach listeners to each cell
    for row in range(nrows):
        for col in range(ncols):
            cell_view = grid_view.CellView(row, col, view)
            population.cells[row][col].add_listener(cell_view)  # Graphics
            population.cells[row][col].add_listener(monitor)  # Change tracking
        view.update(rate=5)

    # Initial view, before simulation starts
    view.update()
    time.sleep(1)
    log.info("Seeding")
    population.seed()  # Note this should set change monitor
    view.update()
    time.sleep(1)

    # Evolve until it reaches quiescence
    log.info("Running")
    steps = 0
    epoch = 0
    while monitor.check():
        monitor.set(False)  # No changes yet in this cycle
        # An 'epoch' is 10 steps.  We stop when an epoch has
        # gone by without a noticeable state change, and we
        # chart each epoch rather than each step
        for _ in range(10):
            steps += 1
            log.debug(f"Step {steps}")
            population.step()
            view.update()
            stats_view.update(day=steps)
            time.sleep(0.1)
        epoch += 1

        # Print stats and update bar graph after each epoch
        stats_view.show(day=steps, epoch=epoch)

    # Simulation is no longer changing.  Leave view open
    # until the user presses enter
    stats_view.show_summary()
    input("Press enter to close")
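The cell/listener wiring in this example follows a simple observer pattern: every cell accepts listeners (a CellView for drawing, a shared ChangeListener for quiescence detection) and notifies them on state changes. A minimal sketch of that contract, with hypothetical Cell and listener classes standing in for the real model and change_listener modules:

class Cell:
    # A grid cell that notifies registered listeners when its state changes
    def __init__(self):
        self.state = "vulnerable"
        self._listeners = []

    def add_listener(self, listener):
        self._listeners.append(listener)

    def set_state(self, state):
        if state != self.state:
            self.state = state
            for listener in self._listeners:
                listener.notify(self, state)

class ChangeListener:
    # Shared flag used for quiescence detection: no notifications means no changes
    def __init__(self):
        self._changed = False

    def notify(self, cell, state):
        self._changed = True

    def check(self):
        return self._changed

    def set(self, value):
        self._changed = value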
Ejemplo n.º 36
0
def _record_call(need_shortterm_record, is_success_func, f, *args, **kwargs):
    global records
    global notify_mgr
    record = {}
    record["EventType"] = f.__name__
    record["Input"] = {"*args": list(args), "**kwargs": dict(kwargs)}

    managed_exception = None
    xray_recorder.begin_subsegment("notifycall-call:%s" % f.__name__)
    try:
        r = f(*args, **kwargs)
        record["Output"] = json.dumps(r, default=str)
    except Exception as e:
        managed_exception = e
        record["Except"] = {
            "Exception": traceback.format_exc(),
            "Stackstrace": traceback.extract_stack(),
            "Reason": json.dumps(e, default=str)
        }
    xray_recorder.end_subsegment()

    if managed_exception is not None:
        # Persist now all aggregated data to not lose them
        xray_recorder.begin_subsegment("notifycall-persist_aggregates:%s" %
                                       f.__name__)
        try:
            KVTable.persist_aggregates()
        except Exception as e:
            log.exception("Failed to persist aggregated date!")
        xray_recorder.end_subsegment()

    if notify_mgr is None or do_not_notify:
        log.debug(
            "Do not write Event in event table: notify_mgr=%s, do_not_notify=%s"
            % (notify_mgr, do_not_notify))
        if managed_exception is not None:
            raise managed_exception
        return r

    ctx = notify_mgr.context

    try:
        need_longterm_record = managed_exception is not None or (
            not is_success_func(args, kwargs, r) if is_success_func is not None else False)
    except Exception as e:
        log.exception(
            "Got an exception while assessing long term event management : %s"
            % e)
        need_longterm_record = True

    # Try to catch the maximum available metadata to ease later diagnosis
    #    Protect against exceptions to ensure proper logging
    record["Metadata"] = {}
    xray_recorder.begin_subsegment("notifycall-build_metadata:%s" % f.__name__)
    try:
        notify_mgr.ec2.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"]["EC2"] = {
            "AllInstanceDetails":
            notify_mgr.ec2.get_instances(),
            "AllInstanceStatuses":
            notify_mgr.ec2.get_instance_statuses(),
            "DrainingInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="draining")
            ],
            "BouncedInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="bounced")
            ],
            "ExcludedInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="excluded")
            ],
            "ErrorInstances": [
                i["InstanceId"]
                for i in notify_mgr.ec2.get_instances(ScalingState="error")
            ],
            "ScalingStates":
            notify_mgr.ec2.get_all_scaling_states()
        }
    except Exception as e:
        log.exception('Failed to create record["Metadata"]["EC2"] : %s' % e)
    xray_recorder.end_subsegment()
    xray_recorder.begin_subsegment("notifycall-build_metadata_targetgroup:%s" %
                                   f.__name__)
    try:
        notify_mgr.targetgroup.get_prerequisites(only_if_not_already_done=True)
        record["Metadata"][
            "TargetGroups"] = notify_mgr.targetgroup.get_targetgroups_info()
    except Exception as e:
        log.exception(
            'Failed to create record["Metadata"]["TargetGroups"] : %s' % e)
    xray_recorder.end_subsegment()

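    # Gzip+base64 the bulky metadata blob so the stored DynamoDB item stays small (items are capped at 400 KB)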
    for key in ["Metadata"]:
        zipped_bytes = gzip.compress(
            bytes(json.dumps(record[key], default=str), "utf-8"))
        record[key] = str(base64.b64encode(zipped_bytes), "utf-8")

    now = misc.utc_now()
    now_seconds = misc.seconds_from_epoch_utc()
    max_longterm_records = Cfg.get_int("notify.event.longterm.max_records")
    if max_longterm_records <= 0:
        need_longterm_record = False

    tables = [
        {
            "Name": ctx["EventTable"],
            "NeedWrite": need_shortterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.default_ttl"),
            "DBImages": False,
            "DebugReport": False
        },
        {
            "Name": ctx["LongTermEventTable"],
            "NeedWrite": need_longterm_record,
            "TTL": Cfg.get_duration_secs("notify.event.longterm.ttl"),
            "DBImages": True,
            "DebugReport": True
        },
    ]
    xray_recorder.begin_subsegment("notifycall-update_tables:%s" % f.__name__)
    for table in tables:
        if not table["NeedWrite"]:
            continue
        UpdateExpression = "set EventSource=:entrypoint, EventType=:eventtype, InputData=:input, OutputData=:output, HandledException=:exception, "
        UpdateExpression += "Metadata=:metadata, ExpirationTime=:expirationtime"
        ExpressionAttributeValues = {
            ':entrypoint': {
                'S': ctx["FunctionName"]
            },
            ':eventtype': {
                'S': record["EventType"]
            },
            ':input': {
                'S': json.dumps(record["Input"], default=str)
            },
            ':output': {
                'S': json.dumps(record.get("Output", {}), default=str)
            },
            ':exception': {
                'S': json.dumps(record.get("Except", ""), default=str)
            },
            ':metadata': {
                'S': json.dumps(record["Metadata"], default=str)
            },
            ':expirationtime': {
                'N': str(now_seconds + table["TTL"])
            }
        }
        if table["DBImages"]:
            # Insert snapshots of the CloudWatch dashboard
            try:
                log.log(log.NOTICE,
                        "Generating snapshots for Dashboard graphs...")
                images = notify_mgr.cloudwatch.get_dashboard_images()
                for i in images:
                    compressed_name = i.replace(" ", "")
                    UpdateExpression += ", Graph_%s_PNG=:graph%s" % (
                        compressed_name, compressed_name)
                    ExpressionAttributeValues[":graph%s" % compressed_name] = {
                        'S': images[i]
                    }
                log.info(
                    "/!\ Generated CloudWatch dashboard PNG snapshots in DynamoDb table '%s' for further event analysis!"
                    % table["Name"])
            except Exception as e:
                log.exception(
                    "Failed to retrieve CloudWatch snapshot images! : %s" % e)

        response = ctx["dynamodb.client"].update_item(
            Key={"EventDate": {
                'S': str(now)
            }},
            UpdateExpression=UpdateExpression,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnConsumedCapacity='TOTAL',
            TableName=table["Name"],
        )

        log.debug(Dbg.pprint(response))
        log.log(
            log.NOTICE, "Written event '[%s] %s' to table '%s'." %
            (str(now), record["EventType"], table["Name"]))

        # Keep under control the number of LongTerm items stored in DynamoDB table
        if need_longterm_record:
            longterm_item_eventdates = [
                m["_"] for m in notify_mgr.state.get_metastring_list(
                    "notify.longterm.itemlist", default=[])
            ]
            log.log(log.NOTICE,
                    "Guessed number of records in LongTerm Event table : %d",
                    len(longterm_item_eventdates))
            longterm_item_eventdates.append(str(now))
            nb_records_to_delete = max(
                len(longterm_item_eventdates) - max_longterm_records, 0)
            for eventdate in longterm_item_eventdates[:nb_records_to_delete]:
                try:
                    response = ctx["dynamodb.client"].delete_item(
                        Key={'EventDate': {
                            'S': eventdate
                        }},
                        TableName=ctx["LongTermEventTable"])
                    log.debug(response)
                    log.log(
                        log.NOTICE,
                        "Purged LongTerm Event record '%s' as too many are already stored (notify.event.longterm.max_records=%d)"
                        % (eventdate, max_longterm_records))
                except Exception as e:
                    log.exception(
                        "Got exception while deleting LongTerm record '%s': %s"
                        % (eventdate, e))
            notify_mgr.state.set_state(
                "notify.longterm.itemlist",
                ";".join(longterm_item_eventdates[nb_records_to_delete:]),
                TTL=Cfg.get_duration_secs("notify.event.longterm.ttl"))
            try:
                KVTable.persist_aggregates()
            except Exception as e:
                log.exception("Got exception while persisting KVTables : %s" %
                              e)

        # Manage Debug report export to S3
        url = ctx["LoggingS3Path"]
        if url != "" and table["DebugReport"] and Cfg.get_int(
                "notify.debug.send_s3_reports"):
            xray_recorder.begin_subsegment(
                "notifycall-publish_all_reports:%s" % f.__name__)
            if ctx["FunctionName"] == "Interact":
                # Avoid recursion if throwing from InteractFunction
                log.info("Publishing Debug reports synchronously...")
                debug.publish_all_reports(ctx, url, "notifymgr_report")
            else:
                client = ctx["sqs.client"]
                log.info(
                    "Notifying Interact SQS Queue '%s' for asynchronous debug report generation..."
                    % ctx["InteractSQSUrl"])
                response = client.send_message(QueueUrl=ctx["InteractSQSUrl"],
                                               MessageBody=json.dumps({
                                                   "OpType":
                                                   "Debug/PublishReportNow",
                                                   "Events": {
                                                       "Timestamp":
                                                       str(ctx["now"])
                                                   }
                                               }))
                log.debug(response)
            xray_recorder.end_subsegment()

    xray_recorder.end_subsegment()
    if managed_exception is not None:
        raise managed_exception
    return r
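Since the record writer above stores Metadata gzip-compressed and base64-encoded, reading an event back requires reversing both steps. A short sketch of that decoding, assuming the item has already been fetched from the DynamoDB event table by other means:

import base64
import gzip
import json

def decode_metadata(encoded):
    # Reverse the gzip+base64 encoding applied to record["Metadata"]
    zipped = base64.b64decode(encoded)
    return json.loads(gzip.decompress(zipped))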
Ejemplo n.º 37
0
    def get_prerequisites(self):
        if Cfg.get_int("cron.disable"):
            return
        self.scheduler_table = kvtable.KVTable(self.context,
                                               self.context["SchedulerTable"])

        # Compute event names
        self.load_event_definitions()

        # Read all existing event rules
        client = self.context["events.client"]
        params = {
            "NamePrefix": "CS-Cron-%s%s-" % (self.context["GroupName"],
                                             self.context["VariantNumber"]),
            "Limit": 10
        }
        rules = []
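        # list_rules pages its output (Limit=10 here), so follow NextToken until all rules are read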
        while True:
            response = client.list_rules(**params)
            if "Rules" in response:
                rules.extend(response["Rules"])
            if "NextToken" not in response:
                break
            params["NextToken"] = response["NextToken"]
        self.rules = rules

        max_rules_per_batch = Cfg.get_int("cron.max_rules_per_batch")
        # Create missing rules
        expected_rule_names = [r["Name"] for r in self.event_names]
        existing_rule_names = [r["Name"] for r in self.rules]
        for r in expected_rule_names:
            if r not in existing_rule_names:
                max_rules_per_batch -= 1
                if max_rules_per_batch <= 0:
                    break
                rule_def = self.get_ruledef_by_name(r)

                try:
                    response = client.put_rule(
                        Name=r,
                        Description="Schedule Event '%s': %s" %
                        (rule_def["EventName"], rule_def["Event"]),
                        RoleArn=self.context["CloudWatchEventRoleArn"],
                        ScheduleExpression=rule_def["Data"][0]["schedule"],
                        State='ENABLED')
                    log.debug("put_rule: %s" % response)
                except Exception as e:
                    log.exception(
                        "Failed to create scheduler event '%s' (%s) : %s" %
                        (r, rule_def["Data"][0]["schedule"], e))

                try:
                    response = client.put_targets(
                        Rule=r,
                        Targets=[{
                            'Arn': self.context["InteractLambdaArn"],
                            'Id': "id%s" % r,
                        }])
                    log.debug("put_targets: %s" % response)
                except Exception as e:
                    log.exception(
                        "Failed to set targets for event rule '%s' : %s" %
                        (r, e))

        # Garbage collect obsolete rules
        for r in existing_rule_names:
            if r not in expected_rule_names:
                max_rules_per_batch -= 1
                if max_rules_per_batch <= 0:
                    break
                try:
                    client.remove_targets(Rule=r, Ids=["id%s" % r])
                    client.delete_rule(Name=r)
                except Exception as e:
                    log.exception("Failed to delete rule '%s' : %s" % (r, e))
Ejemplo n.º 38
0
    def get_prerequisites(self, only_if_not_already_done=False):
        if only_if_not_already_done and self.prereqs_done:
            return

        self.state_table = self.o_state.get_state_table()
        client = self.context["ec2.client"]

        # Retrieve list of instances with appropriate tag
        Filters = [{
            'Name': 'tag:clonesquad:group-name',
            'Values': [self.context["GroupName"]]
        }]

        instances = []
        response = None
        while response is None or "NextToken" in response:
            q = {
                "Filters": Filters,
                "MaxResults": Cfg.get_int("ec2.describe_instances.max_results")
            }
            if response is not None:
                q["NextToken"] = response["NextToken"]
            response = client.describe_instances(**q)
            for reservation in response["Reservations"]:
                instances.extend(reservation["Instances"])

        # Filter out instances with inappropriate state
        non_terminated_instances = []
        for i in instances:
            if i["State"]["Name"] not in ["shutting-down", "terminated"]:
                non_terminated_instances.append(i)

        self.instances = non_terminated_instances
        self.instance_ids = [i["InstanceId"] for i in self.instances]

        # Enrich describe_instances output with instance type details
        if Cfg.get_int("ec2.describe_instance_types.enabled"):
            self.instance_types = []
            for i in self.instances:
                if i["InstanceType"] not in self.instance_types:
                    self.instance_types.append(i["InstanceType"])
            if len(self.instance_types):
                response = client.describe_instance_types(
                    InstanceTypes=self.instance_types)
                self.instance_type_details = response["InstanceTypes"]
                for i in self.instances:
                    i["_InstanceType"] = next(
                        filter(
                            lambda it: it["InstanceType"] == i["InstanceType"],
                            self.instance_type_details), None)

        # Get instances status
        instance_statuses = []
        response = None
        while response is None or "NextToken" in response:
            q = {"InstanceIds": self.instance_ids}
            if response is not None and "NextToken" in response:
                q["NextToken"] = response["NextToken"]
            response = client.describe_instance_status(**q)
            instance_statuses.extend(response["InstanceStatuses"])
        self.instance_statuses = instance_statuses

        # Get AZ status
        response = client.describe_availability_zones()
        self.availability_zones = response["AvailabilityZones"]
        if len(self.availability_zones) == 0:
            raise Exception("Can't have a region with no AZ...")

        self.az_with_issues = []
        if not Cfg.get_int("ec2.az.statusmgt.disable"):
            for az in self.availability_zones:
                if az["State"] in ["impaired", "unavailable"]:
                    self.az_with_issues.append(az)
                if az["State"] != "available":
                    log.warning(
                        "AZ %s(%s) is marked with status '%s' by the EC2.describe_availability_zones() API!"
                        % (az["ZoneName"], az["ZoneId"], az["State"]))
        else:
            log.warning(
                "Automatic AZ issues detection through describe_availability_zones() is DISABLED (ec2.az.statusmgt.disable != 0)..."
            )

        # Use these config keys to simulate an AWS Large Scale Event
        all_az_names = [az["ZoneName"] for az in self.availability_zones]
        all_az_ids = [az["ZoneId"] for az in self.availability_zones]
        for a in Cfg.get_list("ec2.debug.availability_zones_impaired", default=[]):
            if a not in all_az_names and a not in all_az_ids:
                log.warning(
                    "ec2.debug.availability_zones_impaired does not match local AZs! '%s'"
                    % a)
        for a in Cfg.get_list("ec2.az.unavailable_list", default=[]):
            if a not in all_az_names and a not in all_az_ids:
                log.warning(
                    "ec2.az.unavailable_list does not match local AZs! '%s'" % a)
        for az in self.availability_zones:
            zone_name = az["ZoneName"]
            zone_id = az["ZoneId"]
            zone_state = az["State"]
            if zone_name in Cfg.get_list(
                    "ec2.debug.availability_zones_impaired", default=[]):
                zone_state = "impaired"
            if zone_id in Cfg.get_list("ec2.debug.availability_zones_impaired",
                                       default=[]):
                zone_state = "impaired"
            if zone_name in Cfg.get_list("ec2.az.unavailable_list",
                                         default=[]):
                zone_state = "unavailable"
            if zone_id in Cfg.get_list("ec2.az.unavailable_list", default=[]):
                zone_state = "unavailable"
            if zone_state != az["State"] and zone_state in [
                    "impaired", "unavailable"
            ] and az not in self.az_with_issues:
                self.az_with_issues.append(az)
            az["State"] = zone_state
            if zone_state != "available":
                log.warning(
                    "AZ %s(%s) is marked with status '%s' by configuration keys!"
                    % (zone_name, zone_id, zone_state))

        # We need to dynamically register static subfleet configuration keys to avoid a
        #   'key unknown' warning when the user sets them
        static_subfleet_names = self.get_static_subfleet_names()
        for static_fleet in static_subfleet_names:
            key = "staticfleet.%s.state" % static_fleet
            if not Cfg.is_builtin_key_exist(key):
                Cfg.register({key: ""})
        log.log(
            log.NOTICE,
            "Detected the following static subfleet names across EC2 resources: %s"
            % static_subfleet_names)

        self.prereqs_done = True
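The manual NextToken loop at the top of this example can also be expressed with boto3's built-in paginator, which hides the token bookkeeping. A brief alternative sketch under the same tag filter (the function name and group name value are illustrative):

import boto3

def list_group_instances(group_name):
    # Collect all non-terminated instances carrying the CloneSquad group tag
    client = boto3.client("ec2")
    paginator = client.get_paginator("describe_instances")
    filters = [{"Name": "tag:clonesquad:group-name", "Values": [group_name]}]
    instances = []
    for page in paginator.paginate(Filters=filters):
        for reservation in page["Reservations"]:
            instances.extend(i for i in reservation["Instances"]
                             if i["State"]["Name"] not in ["shutting-down", "terminated"])
    return instances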
Ejemplo n.º 39
0
def main_handler_entrypoint(event, context):
    """

    Parameters
    ----------
    event: dict, required

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------

    """

    #print(Dbg.pprint(event))

    ctx["now"] = misc.utc_now()
    ctx["FunctionName"] = "Main"

    init()

    if Cfg.get_int("app.disable") != 0 and not misc.is_sam_local():
        log.warning("Application disabled due to 'app.disable' key")
        return

    no_is_called_too_early = False
    # Manage Spot interruption as fast as we can
    if sqs.process_sqs_records(event, function=ec2_schedule.manage_spot_notification, function_arg=ctx):
        log.info("Managed Spot Interruption SQS record!")
        # Force to run now, disregarding `app.run_period`, as we have at least one Spot instance to
        #   remove from target groups immediately
        no_is_called_too_early = True

    # Check that we are not called too early
    #   Note: We perform a direct read of the KVTable to spare initialization time when the
    #   Lambda is called too early
    ctx["main.last_call_date"] = ctx["o_ec2"].get_state("main.last_call_date", direct=True)
    if ctx["main.last_call_date"] is None or ctx["main.last_call_date"] == "": 
        ctx["main.last_call_date"] = str(misc.epoch())

    if not no_is_called_too_early and is_called_too_early():
        log.log(log.NOTICE, "Called too early by: %s" % event)
        notify.do_not_notify = True
        sqs.process_sqs_records(event)
        sqs.call_me_back_send()
        return

    log.debug("Load prerequisites.")
    load_prerequisites(["o_state", "o_notify", "o_ec2", "o_cloudwatch", "o_targetgroup", "o_ec2_schedule", "o_scheduler", "o_rds"])

    # Remember 'now' as the last execution date
    ctx["o_ec2"].set_state("main.last_call_date", value=ctx["now"], TTL=Cfg.get_duration_secs("app.default_ttl"))

    Cfg.dump()

    # Perform actions:
    log.debug("Main processing.")
    ctx["o_targetgroup"].manage_targetgroup()
    ctx["o_ec2_schedule"].schedule_instances()
    ctx["o_ec2_schedule"].stop_drained_instances()
    ctx["o_cloudwatch"].configure_alarms()
    ctx["o_rds"].manage_subfleet_rds()
    ctx["o_ec2_schedule"].prepare_metrics()

    ctx["o_cloudwatch"].send_metrics()
    ctx["o_cloudwatch"].configure_dashboard()

    # If we got woken up by SNS, acknowledge the message(s) now
    sqs.process_sqs_records(event)

    ctx["o_notify"].notify_user_arn_resources()

    # Call me back if needed
    sqs.call_me_back_send()
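The early-exit logic above throttles the handler so that two invocations within app.run_period do not both run the full processing, except when a Spot interruption forces an immediate run. A hedged sketch of that guard, with plain arguments standing in for the KVTable read and Cfg.get_duration_secs("app.run_period") (the function and parameter names are illustrative):

import time

def called_too_early(last_call_epoch, run_period_secs, force=False):
    # Returns True when the handler should bail out until run_period elapses.
    # 'force' models the Spot-interruption case that must be handled immediately.
    if force:
        return False
    return (time.time() - last_call_epoch) < run_period_secs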