def __init__(self, pushlog_settings):
    """
    LOAD THE PUSHLOG FROM THE DATABASE (UNLESS DISABLED) AND INDEX IT BY (branch, revision)
    """
    with Timer("get pushlog"):
        if pushlog_settings.disable:
            all_pushlogs = []
        else:
            with DB(pushlog_settings) as db:
                all_pushlogs = db.query("""
                    SELECT
                        pl.`date`,
                        left(ch.node, 12) revision,
                        coalesce(bm.alt_name, br.name) branch
                    FROM
                        changesets ch
                    LEFT JOIN
                        pushlogs pl ON pl.id = ch.pushlog_id
                    LEFT JOIN
                        branches br ON br.id = pl.branch_id
                    LEFT JOIN
                        branch_map bm ON br.id = bm.id
                    WHERE
                        pl.date > {{oldest_date}}
                """, {"oldest_date": TOO_OLD})

        Log.note("Got pushlog, now indexing...")
        self.pushlog = wrap(Q.index(all_pushlogs, ["branch", "revision"])._data)

    self.locker = Lock()
    self.unknown_branches = set()
def get_existing_ids(es, settings, branches):
    """
    RETURN THE SET OF datazilla.id VALUES ALREADY IN ES, SCANNED IN BLOCKS OF interval_size
    """
    #FIND WHAT'S IN ES
    bad_ids = []
    int_ids = set()

    demand_pushlog = {"match_all": {}}
    if branches:
        demand_pushlog = {"or": [
            {"not": {"missing": {"field": "test_build.push_date"}}},
            {"not": {"missing": {"field": "test_build.no_pushlog"}}}
        ]}

    if settings.elasticsearch.debug and settings.production.step < 10:
        # SIMPLY RELOAD THIS SMALL NUMBER
        return set([])

    with ESQuery(es) as esq:
        max_id = esq.query({
            "from": es.settings.alias,
            "select": {"value": "datazilla.id", "aggregate": "max"}
        })

    interval_size = 200000
    for mini, maxi in Q.intervals(settings.production.min, max_id + interval_size, interval_size):
        existing_ids = es.search({
            "query": {
                "filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"range": {"datazilla.id": {"gte": mini, "lt": maxi}}},
                        demand_pushlog
                    ]}
                }
            },
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {
                "ids": {"terms": {"field": "datazilla.id", "size": interval_size}}
            }
        })

        for t in existing_ids.facets.ids.terms:
            try:
                int_ids.add(int(t.term))
            except Exception, e:
                bad_ids.append(t.term)

    existing_ids = int_ids
    Log.println("Number of ids in ES: " + str(len(existing_ids)))
    Log.println("BAD ids in ES: " + str(bad_ids))
    return existing_ids
def transform(self, id, datazilla):
    """
    CONVERT ONE DATAZILLA BLOB INTO A LIST OF ES RECORDS
    """
    try:
        r = datazilla.json_blob

        #ADD DATAZILLA MARKUP
        r.datazilla = {
            "id": id,
            "date_loaded": datazilla.date_loaded * 1000,
            "error_flag": datazilla.error_flag,
            "test_run_id": datazilla.test_run_id,
            "processed_flag": datazilla.processed_flag,
            "error_msg": datazilla.error_msg
        }

        #CONVERT UNIX TIMESTAMP TO MILLISECOND TIMESTAMP
        r.testrun.date *= 1000

        def mainthread_transform(r):
            if r == None:
                return None

            output = Struct()

            for i in r.mainthread_readbytes:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].readbytes = i[0]
            r.mainthread_readbytes = None

            for i in r.mainthread_writebytes:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].writebytes = i[0]
            r.mainthread_writebytes = None

            for i in r.mainthread_readcount:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].readcount = i[0]
            r.mainthread_readcount = None

            for i in r.mainthread_writecount:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].writecount = i[0]
            r.mainthread_writecount = None

            r.mainthread = output.values()

        mainthread_transform(r.results_aux)
        mainthread_transform(r.results_xperf)

        #ADD PUSH LOG INFO
        try:
            branch = r.test_build.branch
            if branch.endswith("-Non-PGO"):
                r.test_build.branch = branch
                r.test_build.pgo = False
                branch = branch[0:-8]
            else:
                r.test_build.pgo = True

            with Profiler("get from pushlog"):
                if not self.pushlog:
                    #NO PUSHLOG MEANS WE DO NOTHING TO MARKUP TEST RESULTS
                    pass
                elif self.pushlog[branch]:
                    possible_dates = self.pushlog[branch][r.test_build.revision]
                    if possible_dates:
                        r.test_build.push_date = int(Math.round(possible_dates[0].date * 1000))
                    else:
                        if r.test_build.revision == 'NULL':
                            r.test_build.no_pushlog = True  # OOPS! SOMETHING BROKE
                        elif CNV.milli2datetime(Math.min(r.testrun.date, r.datazilla.date_loaded)) < PUSHLOG_TOO_OLD:
                            Log.note("{{branch}} @ {{revision}} has no pushlog, transforming anyway", r.test_build)
                            r.test_build.no_pushlog = True
                        else:
                            Log.note("{{branch}} @ {{revision}} has no pushlog, try again later", r.test_build)
                            return []  # TRY AGAIN LATER
                else:
                    with self.locker:
                        if branch not in self.unknown_branches:
                            Log.note("Whole branch {{branch}} has no pushlog", {"branch": branch})
                            self.unknown_branches.add(branch)

                    if CNV.milli2datetime(Math.min(r.testrun.date, r.datazilla.date_loaded)) < PUSHLOG_TOO_OLD:
                        r.test_build.no_pushlog = True
                    else:
                        r.test_build.no_pushlog = True
                        #return [r]  #TODO: DO THIS IF WE FIGURE OUT HOW TO HANDLE THE VERY LARGE NUMBER OF RESULTS WITH NO PUSHLOG
        except Exception, e:
            Log.warning("{{branch}} @ {{revision}} has no pushlog", r.test_build, e)

        new_records = []

        # RECORD THE UNKNOWN PART OF THE TEST RESULTS
        remainder = r.copy()
        remainder.results = None
        if len(remainder.keys()) > 4:
            new_records.append(remainder)

        #RECORD TEST RESULTS
        total = StructList()
        if r.testrun.suite in ["dromaeo_css", "dromaeo_dom"]:
            #dromaeo IS SPECIAL, REPLICATES ARE IN SETS OF FIVE
            #RECORD ALL RESULTS
            for i, (test_name, replicates) in enumerate(r.results.items()):
                for g, sub_results in Q.groupby(replicates, size=5):
                    new_record = Struct(
                        test_machine=r.test_machine,
                        datazilla=r.datazilla,
                        testrun=r.testrun,
                        test_build=r.test_build,
                        result={
                            "test_name": unicode(test_name) + "." + unicode(g),
                            "ordering": i,
                            "samples": sub_results
                        }
                    )
                    try:
                        s = stats(sub_results)
                        new_record.result.stats = s
                        total.append(s)
                    except Exception, e:
                        Log.warning("can not reduce series to moments", e)
                    new_records.append(new_record)
"ordering": -1, "stats": geo_mean(total) } ) new_records.append(new_record) # ADD RECORD FOR GRAPH SERVER SUMMARY new_record = Struct( test_machine=r.test_machine, datazilla=r.datazilla, testrun=r.testrun, test_build=r.test_build, result={ "test_name": "summary_old", "ordering": -1, "stats": Stats(samples=Q.sort(total.mean)[:len(total)-1:]) } ) new_records.append(new_record) return new_records except Exception, e: Log.error("Transformation failure on id={{id}}", {"id":id}, e) def stats(values): """ RETURN LOTS OF AGGREGATES """ if values == None: return None
"length": len(CNV.object2JSON(line)), "prefix": CNV.object2JSON(line)[0:130] }, e) missing_ids = missing_ids - existing_ids #COPY MISSING DATA TO ES try: with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as es_sink: with ThreadedQueue(File(settings.param.output_file), size=50) as file_sink: simple_etl = functools.partial(etl, *[es_sink, file_sink, settings, transformer, max_existing_id]) num_not_found = 0 with Multithread(simple_etl, threads=settings.production.threads) as many: results = many.execute([ {"id": id} for id in Q.sort(missing_ids)[:nvl(settings.production.step, NUM_PER_BATCH):] ]) for result in results: if not result: num_not_found += 1 if num_not_found > nvl(settings.production.max_tries, 10): many.inbound.pop_all() # CLEAR THE QUEUE OF OTHER WORK many.stop() break else: num_not_found = 0 except (KeyboardInterrupt, SystemExit): Log.println("Shutdown Started, please be patient") except Exception, e: Log.error("Unusual shutdown!", e)