def __init__(self, **desc):
    """
    Build a "range" domain: an ordered set of [min, max) partitions.

    Either explicit `partitions` are given (each must carry `self.key` and,
    optionally, a `dataIndex` that agrees with its position), or all of
    `min`, `max` and `interval` are given and uniform partitions are
    generated with frange().
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            # NOTE: `== None` / `!= None` kept deliberately — these values may be
            # project Null wrappers whose equality with None is overloaded.
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        # BUG FIX: skip the diagonal — comparing a partition with itself always
        # satisfies `p.min <= q.min and q.min < p.max` for any non-empty range,
        # which falsely reported every partition set as overlapping.
        for p, q in itertools.product(parts, parts):
            if p is not q and p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")
        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def accumulate_logs(source_key, file_name, lines, please_stop):
    """
    Feed structured-log lines into a LogSummary, dispatching each parsed
    record to the accumulator method named by its `action` property.
    Malformed lines are counted in stats.bad_lines rather than raised.
    """
    accumulator = LogSummary()
    for line in lines:
        if please_stop:
            Log.error("Shutdown detected. Structured log iterator is stopped.")

        # INCLUDE THE \n THAT WOULD HAVE BEEN AT END OF EACH LINE
        accumulator.stats.bytes += len(line) + 1
        line = strings.strip(line)
        if line == "":
            continue

        try:
            accumulator.stats.lines += 1
            log = convert.json2value(line)
            log.time = log.time / 1000
            accumulator.stats.start_time = Math.min(accumulator.stats.start_time, log.time)
            accumulator.stats.end_time = Math.max(accumulator.stats.end_time, log.time)

            # FIX log.test TO BE A STRING
            if isinstance(log.test, list):
                log.test = " ".join(log.test)

            accumulator.__getattribute__(log.action)(log)
            if log.subtest:
                accumulator.last_subtest = log.time
        except Exception:
            accumulator.stats.bad_lines += 1
def copy2es(es, settings, work_queue, please_stop=None):
    """
    Drain keys from work_queue and copy the matching S3 objects into
    Elasticsearch, committing the queue after each key. Stops on an empty
    key sentinel ("") or when please_stop is set.
    """
    # EVERYTHING FROM ELASTICSEARCH
    bucket = s3.Bucket(settings.source)

    for key in iter(work_queue.pop, ""):
        if please_stop:
            return
        if key == None:
            continue

        key = unicode(key)
        extend_time = Timer("insert", silent=True)
        Log.note("Indexing {{key}}", key=key)
        with extend_time:
            # Optional down-sampling of what gets copied
            if settings.sample_only:
                sample_filter = {"terms": {"build.branch": settings.sample_only}}
            elif settings.sample_size:
                sample_filter = True
            else:
                sample_filter = None

            # A key with ":" is already fully qualified; otherwise expand the prefix
            if ":" in key:
                more_keys = bucket.keys(prefix=key)
            else:
                more_keys = bucket.keys(prefix=key + ":")

            num_keys = es.copy(more_keys, bucket, sample_filter, settings.sample_size)
            if num_keys > 1:
                Log.note(
                    "Added {{num}} keys from {{key}} block in {{duration}} ({{rate|round(places=3)}} keys/second)",
                    num=num_keys,
                    key=key,
                    duration=extend_time.duration,
                    rate=num_keys / Math.max(extend_time.duration.seconds, 0.01)
                )
        work_queue.commit()
def __init__(self, **desc):
    """
    Build a "range" domain: an ordered set of [min, max) partitions.

    Either explicit `partitions` are given (each must carry `self.key` and,
    optionally, a `dataIndex` that agrees with its position), or all of
    `min`, `max` and `interval` are given and uniform partitions are
    generated with frange().
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            # NOTE: `== None` / `!= None` kept deliberately — these values may be
            # project Null wrappers whose equality with None is overloaded.
            if p.dataIndex != None and p.dataIndex != i:
                Log.error(
                    "Expecting `dataIndex` to agree with the order of the parts"
                )
            if p[self.key] == None:
                Log.error(
                    "Expecting all parts to have {{key}} as a property",
                    key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        # BUG FIX: skip the diagonal — comparing a partition with itself always
        # satisfies `p.min <= q.min and q.min < p.max` for any non-empty range,
        # which falsely reported every partition set as overlapping.
        for p, q in itertools.product(parts, parts):
            if p is not q and p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")
        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def add(self, value):
    """Fold `value` into the running maximum kept on self.max."""
    self.max = Math.max(self.max, value)
def full_etl(settings, sink, bugs):
    """
    ETL one block of bug ids: pull bug versions from Elasticsearch, derive
    review "start" and "end" events, match them pairwise, and push the
    resulting review records to `sink` as JSON.
    """
    with Timer("process block {{start}}", {"start": min(bugs)}):
        es = elasticsearch.Index(settings.source)
        with FromES(es) as esq:
            versions = esq.query({
                "from": "bugs",
                "select": "*",
                "where": {"terms": {"bug_id": bugs}}
            })

        # EVERY OPEN "?" REVIEW REQUEST IS A START EVENT
        starts = qb.run({
            "select": [
                "bug_id",
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "request_time", "value": "modified_ts"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "created_by", "value": "attachments.created_by"},
                "product",
                "component"
            ],
            "from": versions,
            "where": {"and": [
                {"terms": {"attachments.flags.request_status": ["?"]}},
                {"terms": {"attachments.flags.request_type": TYPES}},
                {"equal": ["attachments.flags.modified_ts", "modified_ts"]},
                {"term": {"attachments.isobsolete": 0}}
            ]},
            "sort": ["bug_id", "attach_id", "created_by"]
        })

        ends = qb.run({
            "select": [
                {"name": "bug_id", "value": "bug_id"},
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "modified_ts", "value": lambda r: Math.max(r.modified_ts, r.attachments.modified_ts, r.attachments.flags.modified_ts)},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "modified_by", "value": "attachments.flags.modified_by"},
                {"name": "product", "value": "product"},
                {"name": "component", "value": "component"},
                {"name": "review_end_reason", "value": lambda r: 'done' if r.attachments.flags.request_status != '?' else ('obsolete' if r.attachments.isobsolete == 1 else 'closed')},
                {"name": "review_result", "value": lambda r: '+' if r.attachments.flags.request_status == '+' else ('-' if r.attachments.flags.request_status == '-' else '?')}
            ],
            "from": versions,
            "where": {"and": [
                {"terms": {"attachments.flags.request_type": TYPES}},
                {"or": [
                    # IF THE REQUESTEE SWITCHED THE ? FLAG, THEN IT IS DONE
                    {"and": [
                        {"term": {"attachments.flags.previous_status": "?"}},
                        {"not": {"term": {"attachments.flags.request_status": "?"}}},
                        {"equal": ["attachments.flags.modified_ts", "modified_ts"]}
                    ]},
                    # IF OBSOLETED THE ATTACHMENT, IT IS DONE
                    {"and": [
                        {"term": {"attachments.isobsolete": 1}},
                        {"term": {"previous_values.isobsolete_value": 0}}
                    ]},
                    # SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
                    {"and": [
                        {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
                        {"not": {"terms": {"previous_values.bug_status_value": ["resolved", "verified", "closed"]}}}
                    ]}
                ]}
            ]}
        })

        # SOME ATTACHMENTS GO MISSING, CLOSE THEM TOO
        closed_bugs = {b.bug_id: b for b in qb.filter(versions, {"and": [
            # SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
            {"range": {"expires_on": {"gte": Date.now().milli}}}
        ]})}

        for s in starts:
            if s.bug_id in closed_bugs:
                e = closed_bugs[s.bug_id]
                ends.append({
                    "bug_id": e.bug_id,
                    "bug_status": e.bug_status,
                    "attach_id": s.attach_id,
                    "modified_ts": e.modified_ts,
                    "reviewer": s.reviewer,
                    "request_type": s.request_type,
                    "modified_by": e.modified_by,
                    "product": e.product,
                    "component": e.component,
                    "review_end_reason": 'closed',
                    "review_result": '?'
                })

        # REVIEWS END WHEN REASSIGNED TO SOMEONE ELSE
        changes = qb.run({
            "select": [
                "bug_id",
                {"name": "attach_id", "value": "changes.attach_id"},
                "modified_ts",
                {"name": "reviewer", "value": lambda r: r.changes.old_value.split("?")[1]},
                {"name": "request_type", "value": lambda r: r.changes.old_value.split("?")[0]},
                {"name": "modified_by", "value": "null"},
                "product",
                "component",
                {"name": "review_end_reason", "value": "'reassigned'"}
            ],
            "from": versions,
            # ONLY LOOK FOR NAME CHANGES IN THE "review?" FIELD
            "where": {"and": [
                {"term": {"changes.field_name": "flags"}},
                {"or": [{"prefix": {"changes.old_value": t + "?"}} for t in TYPES]}
            ]}
        })
        ends.extend(changes)

        # PYTHON VERSION NOT CAPABLE OF THIS JOIN, YET — a sketched qb.run()
        # join of starts against a "doneReview" edge was removed here; see
        # repository history for the draft query.

    with Timer("match starts and ends for block {{start}}", {"start": min(*bugs)}):
        reviews = []
        ends = Index(data=ends, keys=["bug_id", "attach_id", "request_type", "reviewer"])

        for g, s in qb.groupby(starts, ["bug_id", "attach_id", "request_type", "reviewer"]):
            start_candidates = qb.sort(s, {"value": "request_time", "sort": 1})
            end_candidates = qb.sort(ends[g], {"value": "modified_ts", "sort": 1})

            # ZIP, BUT WITH ADDED CONSTRAINT s.modified_ts <= e.modified_ts
            if len(start_candidates) > 1:
                Log.note("many reviews on one attachment")

            ei = 0
            for i, s in enumerate(start_candidates):
                # Advance past ends that happened before this start was requested
                while ei < len(end_candidates) and end_candidates[ei].modified_ts < coalesce(s.request_time, convert.datetime2milli(Date.MAX)):
                    ei += 1
                # NOTE(review): no bounds check here — if every remaining end
                # precedes s.request_time this raises IndexError; presumably
                # the data guarantees a matching end exists. TODO confirm.
                e = end_candidates[ei]

                s.review_time = e.modified_ts
                s.review_duration = e.modified_ts - s.request_time
                s.review_result = e.review_result
                s.review_end_reason = e.review_end_reason
                s.product = coalesce(e.product, s.product)
                s.component = coalesce(e.component, s.component)
                s.requester_review_num = -1
                ei += 1

                if s.bug_status == 'closed' and e.review_end_reason == 'closed':
                    # reviews on closed bugs are ignored
                    continue
                reviews.append(s)

        qb.run({
            "from": reviews,
            "window": [{
                "name": "is_first",
                "value": "rownum == 0",
                "edges": ["bug_id"],
                "sort": ["request_time"],
                "aggregate": "none"
            }]
        })

    with Timer("add {{num}} reviews to ES for block {{start}}", {"start": min(*bugs), "num": len(reviews)}):
        sink.extend({"json": convert.value2json(r)} for r in reviews)