def get_videos(self):
    """Get a list of videos included in the paths given under "modelname/videos".

    Returns
    -------
    list
        S3 keys of video files found under the fixed dgp output prefix
        ``<jobdir>/process_results/ensemble-model1-2030-01-01/videos/``.
    """
    ## Load the config for its side effects only; the previously computed
    ## `ensemblesize = max(configdict["ensemble_size"], 9)` was never used
    ## and has been removed (FIX: dead local).
    self.load_config()
    ## video list: assumed identical bc we copied all from the same source.
    ## NOTE(review): the ensemble folder name is hard-coded; this is assumed
    ## to be a fixed output of dgp models — confirm it never varies per job.
    vidlist = utilsparams3.ls_name(
        self.bucket,
        os.path.join(self.jobdir, "process_results",
                     "ensemble-model1-2030-01-01", "videos/"))
    return vidlist
def copy_logs(self):
    """Copy logs generated by the first step
    (results/job__{bucketname}_{timestamp}/logs) into a folder
    (results/job__{bucketname}_{timestamp}/logs_pre_{stepname}) in the
    target bucket.
    """
    ## Keys of every log object produced by the previous step:
    source_keys = utilsparams3.ls_name(self.bucket,
                                       os.path.join(self.jobdir, "logs"))
    ## Destination prefix encodes which step this snapshot precedes:
    dest_prefix = os.path.join(self.jobdir,
                               "logs_pre_{}".format(self.stepname))
    for source_key in source_keys:
        ## Preserve each log's basename under the new prefix and copy it
        ## across to the target bucket.
        dest_key = os.path.join(dest_prefix, os.path.basename(source_key))
        s3_resource.meta.client.copy(
            {"Bucket": self.bucket, "Key": source_key},
            self.targetbucket,
            dest_key)
def monitor_updater(event, context):
    """Newest version of events monitoring that updates pre-existing logs.

    Lambda entry point triggered by EC2 state-change events.

    Parameters
    ----------
    event : dict
        EC2 state-change event; reads ``event["time"]`` and
        ``event["detail"]["instance-id"]`` / ``["state"]``.
    context : object
        Lambda context (unused).

    Raises
    ------
    ValueError
        If the state change is neither "running" nor "shutting-down".
    """
    ## 1. First, find the instance id.
    ## 2. Go find the appropriate log folder in the bucket [bucket available through os.environ].
    ## 3. Now figure out if this is a "running" or "shutting-down" state change.
    ## 4. Accordingly, either update the log [running] or update the log and
    ##    move it to the appropriate folder [given by the log contents].
    ## Include exception handling for the case where one of the fields is not completed.
    time = event['time']
    instanceid = event['detail']['instance-id']
    logname = "{}.json".format(instanceid)
    statechange = event['detail']['state']
    bucket_name = os.environ["BUCKET_NAME"]
    if statechange in ["running", "shutting-down"]:
        print(logname)
        print(utilsparams3.ls_name(bucket_name, "logs/active/"))
        log = utilsparams3.update_monitorlog(bucket_name, logname,
                                             statechange, time)
        path_to_data = log["datapath"]
        ## Monitoring rule names cannot contain ":":
        jobname = os.path.basename(log["jobpath"]).replace(":", "_")
        ## FIX: escape INDIR so regex metacharacters in the env var cannot
        ## corrupt the lookahead pattern.
        ## NOTE(review): findall(...)[0] raises IndexError if INDIR does not
        ## appear in the datapath — confirm upstream guarantees it does.
        groupname = re.findall(
            '.+?(?=/' + re.escape(os.environ["INDIR"]) + ')',
            path_to_data)[0]
        ## Log locations for this job, keyed by the group that made it:
        current_job_log = os.path.join("logs", "active", logname)
        completed_job_log = os.path.join("logs", groupname, logname)
        if statechange == "shutting-down":
            utilsparams3.mv(bucket_name, current_job_log, completed_job_log)
            timepkg.sleep(5)
            ## Now check if we can delete this rule: only safe once no
            ## instance under it still has an active log.
            rulename = "Monitor{}".format(jobname)
            instances_under_rule = utilsparamevents.get_monitored_instances(
                rulename)
            condition = [
                utilsparams3.exists(
                    bucket_name,
                    os.path.join("logs", "active",
                                 "{}.json".format(inst)))
                for inst in instances_under_rule
            ]
            ## Delete the rule when every monitored instance has gone inactive:
            if not any(condition):
                utilsparamevents.full_delete_rule(rulename)
    else:
        print("unhandled state change. quitting")
        raise ValueError("statechange {} not expected".format(statechange))
def get_costmonitoring(self):
    """Gets the cost incurred by a given group so far by looking at the logs
    bucket of the appropriate s3 folder, and compares it against the budget.

    Returns
    -------
    bool
        True if the accumulated cost is still under budget, False otherwise.
    """
    ## first get the path to the log folder we should be looking at.
    group_name = self.path
    ## FIX: the assert was missing its comma (`assert cond "msg"` is a
    ## syntax error); the string is now a proper assertion message.
    assert len(group_name) > 0, "group_name must exist."
    logfolder_path = "logs/{}/".format(group_name)
    full_reportpath = os.path.join(logfolder_path, "i-")
    ## now get all of the computereport filenames:
    all_files = utilsparams3.ls_name(self.bucket_name, full_reportpath)
    cost = 0
    ## now calculate the cost:
    for jobfile in all_files:
        jobdata = utilsparams3.load_json(self.bucket_name, jobfile)
        price = jobdata["price"]
        start = jobdata["start"]
        end = jobdata["end"]
        try:
            starttime = datetime.strptime(start, "%Y-%m-%dT%H:%M:%SZ")
            endtime = datetime.strptime(end, "%Y-%m-%dT%H:%M:%SZ")
            diff = endtime - starttime
            ## NOTE(review): .seconds ignores whole days — likely should be
            ## diff.total_seconds(); confirm against billing expectations.
            duration = abs(diff.seconds)
            ## FIX: accumulate into a per-instance variable. The original
            ## overwrote the accumulator (`cost = ...`) and then doubled it
            ## (`cost += cost`), discarding all previous instances' costs.
            instcost = price * duration / 3600.
        except TypeError:
            ## In rare cases one or the other of these fields has no entry;
            ## for now, charge for the hour:
            instcost = price
        cost += instcost
    ## Now compare with budget:
    budget = float(os.environ["MAXCOST"])
    if cost < budget:
        message = "Incurred cost so far: ${}. Remaining budget: ${}".format(
            cost, budget - cost)
        self.logger.append(message)
        self.logger.write()
        validjob = True
    else:
        message = "Incurred cost so far: ${}. Over budget (${}), cancelling job. Contact administrator.".format(
            cost, budget)
        self.logger.append(message)
        self.logger.write()
        validjob = False
    return validjob
def get_costmonitoring(self):
    """Gets the cost incurred by a given group so far by looking at the logs
    bucket of the appropriate s3 folder, adds an upper-bound projection for
    the job being launched and for other active instances on this AMI, and
    compares the total against the group's budget.

    Returns
    -------
    bool
        True if the projected total cost is under budget, False otherwise.

    Raises
    ------
    Exception
        If the current-job cost, active-instance cost, or budget cannot be
        determined.
    """
    ## first get the path to the log folder we should be looking at.
    group_name = self.path
    ## FIX: the assert was missing its comma (`assert cond "msg"` is a
    ## syntax error); the string is now a proper assertion message.
    assert len(group_name) > 0, "[JOB TERMINATE REASON] Can't locate the group that triggered analysis, making it impossible to determine incurred cost."
    logfolder_path = "logs/{}/".format(group_name)
    full_reportpath = os.path.join(logfolder_path, "i-")
    ## now get all of the computereport filenames:
    all_files = utilsparams3.ls_name(self.bucket_name, full_reportpath)
    cost = 0
    ## now calculate the cost of completed jobs:
    for jobfile in all_files:
        jobdata = utilsparams3.load_json(self.bucket_name, jobfile)
        price = jobdata["price"]
        start = jobdata["start"]
        end = jobdata["end"]
        try:
            starttime = datetime.strptime(start, "%Y-%m-%dT%H:%M:%SZ")
            endtime = datetime.strptime(end, "%Y-%m-%dT%H:%M:%SZ")
            diff = endtime - starttime
            ## NOTE(review): .seconds ignores whole days — likely should be
            ## diff.total_seconds(); confirm against billing expectations.
            duration = abs(diff.seconds)
            instcost = price * duration / 3600.
        except TypeError:
            ## In rare cases one or the other of these fields has no entry;
            ## for now, charge for the hour:
            message = " [Internal (get_costmonitoring)] Duration of past jobs not found. Pricing for an hour"
            self.logger.append(message)
            self.logger.printlatest()
            instcost = price
        cost += instcost
    ## Now add the cost of the job you're currently running: needs duration
    ## from config (self.parse_config), self.instance_type, and the number of
    ## instances. By assuming they're all standard instances we upper bound
    ## the cost.
    try:
        price = utilsparampricing.get_price(
            utilsparampricing.get_region_name(utilsparampricing.region_id),
            self.instance_type, os="Linux")
        nb_instances = len(self.filenames)
        if self.jobduration is None:
            duration = defaultduration / 60  ## in hours.
        else:
            duration = self.jobduration / 60
        jobpricebound = duration * price * nb_instances
        cost += jobpricebound
    except Exception as e:
        print(e)
        raise Exception(
            " [Internal (get_costmonitoring)] Unexpected Error: Unable to estimate cost of current job."
        )
    ## Now compare against the expected cost of instances with the current ami.
    ## FIX: read AMI before the try block; previously it was assigned inside
    ## the try, so a failure there left `ami` unbound and the fallback branch
    ## died with NameError instead of its intended estimate.
    ami = os.environ["AMI"]
    try:
        total_activeprice = self.prices_active_instances_ami(ami)
    except Exception as e:
        print(e)
        try:
            activeprice = utilsparampricing.get_price(
                utilsparampricing.get_region_name(
                    utilsparampricing.region_id),
                self.instance_type, os="Linux")
            number = len(
                [i for i in utilsparamec2.get_active_instances_ami(ami)])
            ## default to the default duration instead if not given:
            activeduration = defaultduration * number / 60
            total_activeprice = activeprice * activeduration
        except Exception as e:
            print(e)
            raise Exception(
                " [Internal (get_costmonitoring)] Unexpected Error: Unable to estimate cost of active jobs."
            )
    cost += total_activeprice
    ## Now compare with budget (customized per group, falling back to the
    ## MAXCOST environment default):
    try:
        budget = float(
            utilsparamssm.get_budget_parameter(self.path, self.bucket_name))
    except ClientError as e:
        try:
            assert e.response["Error"]["Code"] == "ParameterNotFound"
            budget = float(os.environ["MAXCOST"])
            message = " [Internal (get_costmonitoring)] Customized budget not found. Using default budget value of {}".format(
                budget)
            self.logger.append(message)
            self.logger.printlatest()
        except Exception:  ## FIX: narrowed from a bare except.
            raise Exception(
                " [Internal (get_costmonitoring)] Unexpected Error: Unable to get budget."
            )
    except Exception:
        raise Exception(
            " [Internal (get_costmonitoring)] Unexpected Error: Unable to get budget."
        )
    if cost < budget:
        message = " [Internal (get_costmonitoring)] Projected total costs: ${}. Remaining budget: ${}".format(
            cost, budget - cost)
        self.logger.append(message)
        self.logger.printlatest()
        self.logger.write()
        validjob = True
    else:
        ## FIX: this message string was broken across two physical lines in
        ## the original source; rejoined to match the v1 wording.
        message = " [Internal (get_costmonitoring)] Projected total costs: ${}. Over budget (${}), cancelling job. Contact administrator.".format(
            cost, budget)
        self.logger.append(message)
        self.logger.printlatest()
        self.logger.write()
        validjob = False
    return validjob