def get(self):
    """Render the jobs summary page.

    For every (benchmark, engine, job size) combination, fetch the
    matching benchmark Records, compute summary statistics over their
    run times, and render them into templates/jobs.html.
    """
    results = []
    for jt in JOB_TYPES:
        for e in ENGINES:
            for js in JOB_SIZE:
                q = Record.all()
                q.filter('benchmark =', jt)
                q.filter('engine_type =', e)
                q.filter('num_entities =', js)
                # Datastore caps a single fetch at 1000 entities; runs
                # beyond that are not included in the statistics.
                ents = q.fetch(1000)
                if not ents:
                    continue
                # Collect run times; records without a total are skipped
                # (some jobs may have failed and never got one).
                points = [ii.total for ii in ents if ii.total]
                n = len(points)
                if n:
                    maximum = max(points)
                    minimum = min(points)
                    sum_x = getTotal(points)
                    mean = getAverage(points, sum_x)
                    stdev = getStDev(points, mean)
                else:
                    # No successful runs for this combination: report
                    # zeroed statistics instead of the old bogus
                    # sentinel minimum of 99999999.
                    sum_x = mean = stdev = maximum = minimum = 0
                results.append(Job(n, sum_x, mean, maximum, minimum,
                                   stdev, jt, e, js))
    self.response.out.write(template.render('templates/jobs.html',
                                            {'jobs': results,
                                             'jobs_len': len(results)}))
    return
def get(self):
    """Render the subset benchmark dashboard.

    Requires a signed-in user; anonymous visitors are redirected to the
    login page. Shows the most recent subset data sets, subset benchmark
    records, and the current per-engine counter totals.
    """
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(dest_url="/"))
        return
    q = SubSetDataSet.all()
    q.order('-start')
    # fetch() already returns a list; the old element-by-element copy
    # ([result for result in results]) was pointless.
    datasets = q.fetch(1000)
    datasets_len = len(datasets)
    q = Record.all()
    q.filter('benchmark =', "subset")
    q.order('-start')
    records = q.fetch(1000)
    records_len = len(records)
    fsm_count = get_fsm_count()
    pl_count = get_pl_count()
    mr_count = get_mr_count()
    self.response.out.write(template.render("templates/subset.html",
                                            {"user": user.email(),
                                             "datasets_len": datasets_len,
                                             "datasets": datasets,
                                             "fsm_count": fsm_count,
                                             "pl_count": pl_count,
                                             "mr_count": mr_count,
                                             "records": records,
                                             "records_len": records_len}))
def post(self):
    """Mapreduce done-callback for the FSM subset cleanup job.

    Looks up the Record for the finished mapreduce job and stamps it
    with the end time the fsm mapper stored in memcache, computing the
    total run time in seconds.
    """
    name = self.request.headers["mapreduce-id"]
    if not name:
        name = "NAME NOT FOUND"
    logging.info("MR CALLBACK " + name)
    q = Record.all()
    q.filter('mr_id =', name)
    # Bug fix: the old code discarded fetch()'s return value and tested
    # the Query object itself, which is always truthy, so the
    # "Unable to find record" branch was unreachable and iterating the
    # query re-ran it without a limit.
    items = q.fetch(1)
    if items:
        for ii in items:
            t = memcache.get('fsm_mapper_cleanup')
            if not t:
                logging.error("Unable to get datetime from memcache")
                return False
            # Stored as "YYYY-mm-dd HH:MM:SS.ffffff"; parse the
            # fractional part separately.
            dt, msec = t.split(".")
            dt = datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
            msec = datetime.timedelta(microseconds=int(msec))
            fullDatetime = dt + msec
            ii.end = fullDatetime
            delta = (ii.end - ii.start)
            ii.total = float(delta.days * 86400 + delta.seconds) + \
                float(delta.microseconds) / 1000000
            ii.state = "Done"
            ii.put()
            logging.info("updated: record for MR job id %s" % name)
    else:
        logging.info("Unable to find record for MR job id %s" % name)
def finalized(self):
    """Pipeline completion hook: close out this pipeline's Record.

    Finds the Record tagged with this pipeline's id, stamps its end
    time, computes the total run time in seconds, and marks it Done.
    """
    pid = self.pipeline_id
    query = Record.all()
    query.filter('pipeline_id =', pid)
    for rec in query.fetch(1):
        rec.end = datetime.datetime.now()
        elapsed = rec.end - rec.start
        rec.total = float(elapsed.days * 86400 + elapsed.seconds) + \
            float(elapsed.microseconds) / 1000000
        rec.state = "Done"
        rec.put()
        logging.info("Updated aggregate pipeline record")
    logging.info("Done with aggregate pipeline")
def post(self):
    """Mapreduce done-callback: mark the matching Record as finished.

    Looks up the Record for the completed mapreduce job, stamps the end
    time with the current time, and computes the total run time in
    seconds.
    """
    name = self.request.headers["mapreduce-id"]
    if not name:
        name = "NAME NOT FOUND"
    logging.info("MR CALLBACK " + name)
    q = Record.all()
    q.filter('mr_id =', name)
    # Bug fix: the old code discarded fetch()'s return value and tested
    # the Query object itself, which is always truthy, so the
    # "Unable to find record" branch was unreachable and iterating the
    # query re-ran it without a limit.
    items = q.fetch(1)
    if items:
        for ii in items:
            ii.end = datetime.datetime.now()
            delta = (ii.end - ii.start)
            ii.total = float(delta.days * 86400 + delta.seconds) + \
                float(delta.microseconds) / 1000000
            ii.state = "Done"
            ii.put()
            logging.info("updated: record for MR job id %s" % name)
    else:
        logging.info("Unable to find record for MR job id %s" % name)
def fsm_calculate_run_time():
    """Derive the end time of the last FSM subset run.

    Fantasm does not give call backs when its done. Must figure it out
    with another job using the last modified date on output entities.

    Returns:
        True when the record was updated, False when no fsm/subset
        record or no output entities could be found.
    """
    # Get the last job which was run for subset/fsm
    q = Record.all()
    q.filter('engine_type =', 'fsm')
    q.filter('benchmark =', 'subset')
    q.order('-start')
    results = q.fetch(1)
    if len(results) == 0:
        logging.error("Unable to find a record for fsm/subset")
        return False
    record = results[0]
    if record.state == "Done":
        # Preserved original behaviour: log the condition but still
        # recalculate the end time.
        logging.error("Last FSM end time has already been calculated")
    logging.info(str(record.num_entities))
    q = SSFSMSimpleCounterShard.all()
    # Scan all output entities for the latest 'modified' timestamp.
    # Bug fix: the old loop called q.fetch(1000) repeatedly without
    # advancing a cursor, so every pass returned the same first batch —
    # with 1000+ entities it looped forever and never saw the rest.
    max_date = None
    while True:
        batch = q.fetch(1000)
        for shard in batch:
            date = shard.modified
            if max_date is None or max_date < date:
                max_date = date
        if len(batch) < 1000:
            break
        q.with_cursor(q.cursor())
    if not max_date:
        logging.error("Unable to calculate the max date for FSM/subset")
        return False
    record.state = "Done"
    record.end = max_date
    delta = (record.end - record.start)
    record.total = float(delta.days * 86400 + delta.seconds) + \
        float(delta.microseconds) / 1000000
    record.put()
    return True
def fsm_calculate_run_time():
    """ Fantasm does not give call backs when its done. Must
        figure it out with another job using the last modified
        date on output entities
    """
    # Grab the most recent benchmark record for grep/fsm.
    query = Record.all()
    query.filter('engine_type =', 'fsm')
    query.filter('benchmark =', 'grep')
    query.order('-start')
    latest = query.fetch(1)
    if not latest:
        logging.error("Unable to find a record for fsm/grep")
        return False
    for rec in latest:
        rec.state = "Calculating time"
        rec.put()
        # One shard per 1000 entities, clamped to the [1, 256] range
        # (256 is the max amount of shards allowed).
        shards = min(max(rec.num_entities / 1000, 1), 256)
        kind = "GrepResults"  # get_output_class(rec.num_entities)
        # Kick off a cleanup mapreduce whose done-callback will stamp
        # the end time on this record.
        mapreduce_id = control.start_map(
            name="FSM Grep cleanup",
            handler_spec="grep.fsm_mapper",
            reader_spec="mapreduce.input_readers.DatastoreInputReader",
            mapper_parameters={
                "entity_kind": "data.grep." + kind,
                "processing_rate": 500
            },
            mapreduce_parameters={
                model.MapreduceSpec.PARAM_DONE_CALLBACK: '/grep/fsm/callback'
            },
            shard_count=shards,
            queue_name="default",
        )
        rec.mr_id = mapreduce_id
        rec.put()
    return True
def get(self):
    """Render the grep benchmark dashboard.

    Requires a signed-in user; anonymous visitors are redirected to the
    login page. Shows the most recent grep data sets and grep benchmark
    records.
    """
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(dest_url="/"))
        return
    q = GrepDataSet.all()
    q.order('-start')
    # fetch() already returns a list; the old element-by-element copy
    # ([result for result in results]) was pointless.
    datasets = q.fetch(1000)
    datasets_len = len(datasets)
    q = Record.all()
    q.filter('benchmark =', "grep")
    q.order('-start')
    records = q.fetch(1000)
    records_len = len(records)
    self.response.out.write(template.render("templates/grep.html",
                                            {"user": user.email(),
                                             "datasets_len": datasets_len,
                                             "datasets": datasets,
                                             "records": records,
                                             "records_len": records_len}))
def fsm_calculate_run_time():
    """Derive the end time of the last FSM aggregate run.

    Fantasm does not give call backs when its done. Must figure it out
    with another job using the last modified date on output entities.
    Considers both the plain 'fsm' engine and the 'fsm_fan_in' variant
    and updates whichever ran most recently.

    Returns:
        True when the record was updated, False when no fsm/aggregate
        record or no output entities could be found.
    """
    # Get the last job which was run for aggregate/fsm
    q = Record.all()
    q.filter('engine_type =', 'fsm')
    q.filter('benchmark =', 'aggregate')
    q.order('-start')
    results = q.fetch(1)
    # There is a second type of fsm job that has a fan in state
    q2 = Record.all()
    q2.filter('engine_type =', 'fsm_fan_in')
    q2.filter('benchmark =', 'aggregate')
    q2.order('-start')
    results2 = q2.fetch(1)
    if len(results) == 0 and len(results2) == 0:
        logging.error("Unable to find a record for fsm/aggregate")
        return False
    # Take only the one which ran last
    if len(results) == 0:
        results = results2  # fsm with fan in ran last
    elif len(results2) == 0:
        pass
    elif results[0].start > results2[0].start:
        pass
    else:
        results = results2  # fsm with fan in ran last
    record = results[0]
    if record.state == "Done":
        # Preserved original behaviour: log the condition but still
        # recalculate the end time.
        logging.error("Last FSM end time has already been calculated")
    logging.info(str(record.num_entities))
    q = FSMSimpleCounterShard.all()
    # Scan all output entities for the latest 'modified' timestamp.
    # Bug fix: the old loop called q.fetch(1000) repeatedly without
    # advancing a cursor, so every pass returned the same first batch —
    # with 1000+ entities it looped forever and never saw the rest.
    max_date = None
    while True:
        batch = q.fetch(1000)
        for shard in batch:
            date = shard.modified
            if max_date is None or max_date < date:
                max_date = date
        if len(batch) < 1000:
            break
        q.with_cursor(q.cursor())
    if not max_date:
        logging.error("Unable to calculate the max date for FSM/aggregate")
        return False
    record.state = "Done"
    record.end = max_date
    delta = (record.end - record.start)
    record.total = float(delta.days * 86400 + delta.seconds) + \
        float(delta.microseconds) / 1000000
    record.put()
    return True