def test_datetime(self):
    """Verify datetime2milli() for datetime and date inputs, plus its text form."""
    cases = [
        (datetime.datetime(2012, 7, 24), 1343088000000),
        (datetime.date(2012, 7, 24), 1343088000000),
        (datetime.datetime(2014, 1, 7, 10, 21, 0), 1389090060000),
    ]
    for value, millis in cases:
        assert convert.datetime2milli(value) == millis

    # STRING RENDERING OF THE MILLISECOND VALUE
    as_text = text(convert.datetime2milli(datetime.datetime(2014, 1, 7, 10, 21, 0)))
    assert as_text == u"1389090060000"
def test_datetime(self):
    """
    Verify convert.datetime2milli() for datetime and date inputs, and the
    unicode rendering of its result.

    FIX: the original literals (07, 01, 00) have leading zeros, which are
    octal in Python 2 and a SyntaxError in Python 3.  All digits involved
    are < 8, so the decimal forms below are value-identical.
    """
    result = convert.datetime2milli(datetime.datetime(2012, 7, 24))
    expected = 1343088000000
    assert result == expected

    result = convert.datetime2milli(datetime.date(2012, 7, 24))
    expected = 1343088000000
    assert result == expected

    result = convert.datetime2milli(datetime.datetime(2014, 1, 7, 10, 21, 0))
    expected = 1389090060000
    assert result == expected

    # STRING RENDERING OF THE MILLISECOND VALUE
    result = unicode(convert.datetime2milli(datetime.datetime(2014, 1, 7, 10, 21, 0)))
    expected = u"1389090060000"
    assert result == expected
def value2value(value):
    """
    CONVERT FROM PYTHON VALUE TO ES EQUIVALENT
    """
    if isinstance(value, datetime):
        # TIMES ARE STORED AS EPOCH MILLISECONDS IN ES
        return convert.datetime2milli(value)
    elif isinstance(value, Duration):
        # DURATION IS REPRESENTED BY ITS MILLISECOND COUNT
        return value.milli
    else:
        # EVERYTHING ELSE PASSES THROUGH UNCHANGED
        return value
def value2query(value):
    """
    CONVERT PYTHON VALUE TO A FORM USABLE IN A QUERY:
    TIMES AND DURATIONS BECOME MILLISECONDS, NUMBERS PASS THROUGH,
    EVERYTHING ELSE IS QUOTED AS A STRING.
    """
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    elif isinstance(value, Duration):
        return value.milli
    elif Math.is_number(value):
        return value
    else:
        return convert.string2quote(value)
def full_etl(settings):
    """
    Copy dependson/blocked information from the source index into the
    destination index, then run the fix-point computation on each batch.

    settings - holds .source and .destination ES settings; .incremental
               limits the backfill to bugs near the destination's maximum
    """
    # ENSURE DESTINATION INDEX EXISTS WITH THE EXPECTED SCHEMA
    # (round-trip through json expands SCHEMA's dotted leaves)
    schema = convert.json2value(convert.value2json(SCHEMA), leaves=True)
    Cluster(settings.destination).get_or_create_index(settings=settings.destination, schema=schema, limit_replicas=True)

    destq = FromES(settings.destination)
    if settings.incremental:
        # START A LITTLE BEFORE THE DESTINATION'S MAX bug_id, FOR OVERLAP
        min_bug_id = destq.query({
            "from": coalesce(settings.destination.alias, settings.destination.index),
            "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
        })
        min_bug_id = int(MAX(min_bug_id - 1000, 0))
    else:
        # FULL BACKFILL
        min_bug_id = 0

    sourceq = FromES(settings.source)
    # +1 SO THE HIGHEST bug_id IS INCLUDED IN THE half-open INTERVALS BELOW
    max_bug_id = sourceq.query({
        "from": coalesce(settings.source.alias, settings.source.index),
        "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
    }) + 1
    max_bug_id = int(coalesce(max_bug_id, 0))

    # FIRST, GET ALL MISSING BUGS
    # (reverse order: newest bug_id ranges are processed first)
    for s, e in qb.reverse(list(qb.intervals(min_bug_id, max_bug_id, 10000))):
        with Timer("pull {{start}}..{{end}} from ES", {"start": s, "end": e}):
            children = sourceq.query({
                "from": settings.source.alias,
                "select": ["bug_id", "dependson", "blocked", "modified_ts", "expires_on"],
                "where": {"and": [
                    {"range": {"bug_id": {"gte": s, "lt": e}}},
                    # ONLY BUGS THAT PARTICIPATE IN A DEPENDENCY RELATION
                    {"or": [
                        {"exists": "dependson"},
                        {"exists": "blocked"}
                    ]}
                ]},
                "limit": 10000
            })
        with Timer("fixpoint work"):
            to_fix_point(settings, destq, children.data)

    # PROCESS RECENT CHANGES (last 7 days, regardless of bug_id range)
    with Timer("pull recent dependancies from ES"):
        children = sourceq.query({
            "from": settings.source.alias,
            "select": ["bug_id", "dependson", "blocked"],
            "where": {"and": [
                {"range": {"modified_ts": {"gte": convert.datetime2milli(datetime.utcnow() - timedelta(days=7))}}},
                {"or": [
                    {"exists": "dependson"},
                    {"exists": "blocked"}
                ]}
            ]},
            "limit": 100000
        })
        to_fix_point(settings, destq, children.data)
def value2MVEL(value):
    """
    FROM PYTHON VALUE TO MVEL EQUIVALENT
    """
    if isinstance(value, datetime):
        # TIME: epoch milliseconds, annotated with a readable timestamp
        return "%s /*%s*/" % (str(convert.datetime2milli(value)), value.format("yyNNNdd HHmmss"))
    if isinstance(value, Duration):
        # DURATION: milliseconds, annotated with the duration's string form
        return "%s /*%s*/" % (str(convert.timedelta2milli(value)), str(value))
    if Math.is_number(value):
        return str(value)
    return convert.string2quote(value)
def value2MVEL(value):
    """
    FROM PYTHON VALUE TO MVEL EQUIVALENT
    """
    if isinstance(value, datetime):
        # TIME: epoch milliseconds, annotated with a readable timestamp
        return "%s /*%s*/" % (str(convert.datetime2milli(value)), value.format("yyNNNdd HHmmss"))
    if isinstance(value, Duration):
        # DURATION: milliseconds, annotated with the duration's string form
        return "%s /*%s*/" % (str(convert.timedelta2milli(value)), str(value))
    if Math.is_number(value):
        return str(value)
    return quote(value)
def get_pending(es, since):
    """
    Return a Multiset of bug_id -> count of versions modified on-or-after
    `since`.  Scans the index in 100K-wide bug_id strides so each facet
    request stays small.
    """
    # FIND THE HIGHEST bug_id TO BOUND THE SCAN
    stats = es.search({
        "query": {"match_all": {}},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"statistical": {"field": "bug_id"}}}
    })
    max_bug = int(stats.facets.default.max)

    pending_bugs = None
    for lo, hi in qb.intervals(0, max_bug + 1, 100000):
        Log.note("Collect history for bugs from {{start}}..{{end}}", {"start": lo, "end": hi})
        result = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"range": {"modified_ts": {"gte": convert.datetime2milli(since)}}},
                    {"range": {"bug_id": {"gte": lo, "lte": hi}}}
                ]}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
        })
        batch = Multiset(
            result.facets.default.terms,
            key_field="term",
            count_field="count"
        )
        # ACCUMULATE COUNTS ACROSS STRIDES
        pending_bugs = batch if pending_bugs is None else pending_bugs + batch

    Log.note("Source has {{num}} bug versions for updating", {
        "num": len(pending_bugs)
    })
    return pending_bugs
def get_last_updated(es):
    """
    Return the most recent modified_ts in the index as a datetime, looking
    no further back than `far_back` (module global).  Returns the epoch
    (milli2datetime(0)) when nothing matches or the search fails.

    FIX: `except Exception, e` is Python-2-only syntax (SyntaxError on
    Python 3) and `e` was unused; replaced with the version-neutral
    `except Exception:`.
    """
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"range": {
                    "modified_ts": {"gte": convert.datetime2milli(far_back)}
                }}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"modified_ts": {"statistical": {"field": "modified_ts"}}}
        })
        if results.facets.modified_ts.count == 0:
            # NO DOCUMENTS IN RANGE
            return convert.milli2datetime(0)
        return convert.milli2datetime(results.facets.modified_ts.max)
    except Exception:
        # BEST EFFORT: any failure is treated as "no known last update"
        return convert.milli2datetime(0)
def main(settings):
    """
    Replicate recently-modified dependency records from the source ES index
    to the destination cluster, tracking the last replication time in a file.
    """
    current_time = datetime.utcnow()
    time_file = File(settings.param.last_replication_time)

    # SYNCH WITH source ES INDEX
    source = Index(settings.source)
    destination = Cluster(settings.destination).get_or_create_index(settings.destination)

    # GET LAST UPDATED: take the earlier of the timestamp file and what the
    # destination itself reports (minus a 1-hour overlap for safety)
    from_file = None
    if time_file.exists:
        from_file = convert.milli2datetime(convert.value2int(time_file.read()))
    from_es = get_last_updated(destination) - timedelta(hours=1)
    last_updated = MIN(coalesce(from_file, convert.milli2datetime(0)), from_es)
    Log.note("updating records with modified_ts>={{last_updated}}", {"last_updated": last_updated})

    pending = get_pending(source, last_updated)
    # BATCH WRITES TO THE DESTINATION, 1000 DOCS AT A TIME
    with ThreadedQueue(destination, batch_size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(convert.datetime2milli(current_time)))
def full_etl(settings, sink, bugs):
    """
    Compute review start/end events for the given block of bug_ids, match
    them into completed reviews, and push the reviews to `sink` as
    {"json": ...} records.

    settings - ETL settings (source ES index, etc.)
    sink     - destination queue/index; receives one record per review
    bugs     - the bug_id values in this block
    """
    with Timer("process block {{start}}", {"start": min(bugs)}):
        es = elasticsearch.Index(settings.source)
        with FromES(es) as esq:
            # PULL EVERY VERSION OF EVERY BUG IN THIS BLOCK
            versions = esq.query({
                "from": "bugs",
                "select": "*",
                "where": {"terms": {"bug_id": bugs}}
            })

        # A "start" IS A VERSION WHERE A REVIEW FLAG WAS SET TO "?"
        starts = qb.run({
            "select": [
                "bug_id",
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                {"name": "request_time", "value": "modified_ts"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "created_by", "value": "attachments.created_by"},
                "product",
                "component"
            ],
            "from": versions,
            "where": {"and": [
                {"terms": {"attachments.flags.request_status": ["?"]}},
                {"terms": {"attachments.flags.request_type": TYPES}},
                # THE FLAG CHANGE HAPPENED IN THIS VERY VERSION
                {"equal": ["attachments.flags.modified_ts", "modified_ts"]},
                {"term": {"attachments.isobsolete": 0}}
            ]},
            "sort": ["bug_id", "attach_id", "created_by"]
        })

        # AN "end" IS ANY EVENT THAT TERMINATES A PENDING REVIEW
        ends = qb.run({
            "select": [
                {"name": "bug_id", "value": "bug_id"},
                "bug_status",
                {"name": "attach_id", "value": "attachments.attach_id"},
                # LATEST OF THE BUG/ATTACHMENT/FLAG TIMESTAMPS
                {"name": "modified_ts", "value": lambda r: Math.max(r.modified_ts, r.attachments.modified_ts, r.attachments.flags.modified_ts)},
                {"name": "reviewer", "value": "attachments.flags.requestee"},
                {"name": "request_type", "value": "attachments.flags.request_type"},
                {"name": "modified_by", "value": "attachments.flags.modified_by"},
                {"name": "product", "value": "product"},
                {"name": "component", "value": "component"},
                {"name": "review_end_reason", "value": lambda r: 'done' if r.attachments.flags.request_status != '?' else ('obsolete' if r.attachments.isobsolete == 1 else 'closed')},
                {"name": "review_result", "value": lambda r: '+' if r.attachments.flags.request_status == '+' else ('-' if r.attachments.flags.request_status == '-' else '?')}
            ],
            "from": versions,
            "where": {"and": [
                {"terms": {"attachments.flags.request_type": TYPES}},
                {"or": [
                    # IF THE REQUESTEE SWITCHED THE ? FLAG, THEN IT IS DONE
                    {"and": [
                        {"term": {"attachments.flags.previous_status": "?"}},
                        {"not": {"term": {"attachments.flags.request_status": "?"}}},
                        {"equal": ["attachments.flags.modified_ts", "modified_ts"]}
                    ]},
                    # IF OBSOLETED THE ATTACHMENT, IT IS DONE
                    {"and": [
                        {"term": {"attachments.isobsolete": 1}},
                        {"term": {"previous_values.isobsolete_value": 0}}
                    ]},
                    # SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
                    {"and": [
                        {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
                        {"not": {"terms": {"previous_values.bug_status_value": ["resolved", "verified", "closed"]}}}
                    ]}
                ]}
            ]}
        })

        # SOME ATTACHMENTS GO MISSING, CLOSE THEM TOO
        closed_bugs = {b.bug_id: b for b in qb.filter(versions, {"and": [
            # SOME BUGS ARE CLOSED WITHOUT REMOVING REVIEW
            {"terms": {"bug_status": ["resolved", "verified", "closed"]}},
            {"range": {"expires_on": {"gte": Date.now().milli}}}
        ]})}

        # SYNTHESIZE AN "end" (reason='closed') FOR EACH start ON A CLOSED BUG
        for s in starts:
            if s.bug_id in closed_bugs:
                e = closed_bugs[s.bug_id]
                ends.append({
                    "bug_id": e.bug_id,
                    "bug_status": e.bug_status,
                    "attach_id": s.attach_id,
                    "modified_ts": e.modified_ts,
                    "reviewer": s.reviewer,
                    "request_type": s.request_type,
                    "modified_by": e.modified_by,
                    "product": e.product,
                    "component": e.component,
                    "review_end_reason": 'closed',
                    "review_result": '?'
                })

        # REVIEWS END WHEN REASSIGNED TO SOMEONE ELSE
        changes = qb.run({
            "select": [
                "bug_id",
                {"name": "attach_id", "value": "changes.attach_id"},
                "modified_ts",
                # old_value LOOKS LIKE "<type>?<reviewer>"; split on "?"
                {"name": "reviewer", "value": lambda r: r.changes.old_value.split("?")[1]},
                {"name": "request_type", "value": lambda r: r.changes.old_value.split("?")[0]},
                {"name": "modified_by", "value": "null"},
                "product",
                "component",
                {"name": "review_end_reason", "value": "'reassigned'"}
            ],
            "from": versions,
            # ONLY LOOK FOR NAME CHANGES IN THE "review?" FIELD
            "where": {"and": [
                {"term": {"changes.field_name": "flags"}},
                {"or": [{"prefix": {"changes.old_value": t + "?"}} for t in TYPES]}
            ]}
        })
        ends.extend(changes)

        # PYTHON VERSION NOT CAPABLE OF THIS JOIN, YET
        # reviews = qb.run({
        #     "from":
        #         starts,
        #     "select": [
        #         {"name": "bug_status", "value": "bug_status", "aggregate": "one"},
        #         {"name": "review_time", "value": "doneReview.modified_ts", "aggregate": "minimum"},
        #         {"name": "review_result", "value": "doneReview.review_result", "aggregate": "minimum"},
        #         {"name": "product", "value": "coalesce(doneReview.product, product)", "aggregate": "minimum"},
        #         {"name": "component", "value": "coalesce(doneReview.component, component)", "aggregate": "minimum"},
        #         # {"name": "keywords", "value": "(coalesce(keywords, '')+' '+ETL.parseWhiteBoard(whiteboard)).trim()+' '+flags", "aggregate": "one"},
        #         {"name": "requester_review_num", "value": "-1", "aggregate": "one"}
        #     ],
        #     "analytic": [
        #         {"name": "is_first", "value": "rownum==0 ? 1 : 0", "sort": "request_time", "edges": ["bug_id"]}
        #     ],
        #     "edges": [
        #         "bug_id",
        #         "attach_id",
        #         {"name": "reviewer", "value": "requestee"},
        #         {"name": "requester", "value": "created_by"},
        #         {"name": "request_time", "value": "modified_ts"},
        #         {
        #             "name": "doneReview",
        #             "test":
        #                 "bug_id==doneReview.bug_id && " +
        #                 "attach_id==doneReview.attach_id && " +
        #                 "requestee==doneReview.requestee && " +
        #                 "!(bug_status=='closed' && doneReview.review_end_reason=='closed') && " +
        #                 "modified_ts<=doneReview.modified_ts",
        #             "allowNulls": True,
        #             "domain": {"type": "set", "key": ["bug_id", "attach_id", "requestee", "modified_ts"], "partitions": ends}
        #         }
        #     ]
        # })

        with Timer("match starts and ends for block {{start}}", {"start": min(*bugs)}):
            reviews = []
            ends = Index(data=ends, keys=["bug_id", "attach_id", "request_type", "reviewer"])
            for g, s in qb.groupby(starts, ["bug_id", "attach_id", "request_type", "reviewer"]):
                start_candidates = qb.sort(s, {"value": "request_time", "sort": 1})
                end_candidates = qb.sort(ends[g], {"value": "modified_ts", "sort": 1})
                # ZIP, BUT WITH ADDED CONSTRAINT s.modified_ts<=e.modified_ts
                if len(start_candidates) > 1:
                    Log.note("many reviews on one attachment")
                ei = 0
                # NOTE(review): `s` here shadows the group list `s` from
                # qb.groupby above — intentional reuse, but fragile
                for i, s in enumerate(start_candidates):
                    # SKIP ends THAT HAPPENED BEFORE THIS start'S request_time
                    while ei < len(end_candidates) and end_candidates[ei].modified_ts < coalesce(s.request_time, convert.datetime2milli(Date.MAX)):
                        ei += 1
                    # NOTE(review): if every end precedes request_time, ei may
                    # equal len(end_candidates) here and this will IndexError —
                    # TODO confirm that an end always exists per start
                    e = end_candidates[ei]
                    s.review_time = e.modified_ts
                    s.review_duration = e.modified_ts - s.request_time
                    s.review_result = e.review_result
                    s.review_end_reason = e.review_end_reason
                    s.product = coalesce(e.product, s.product)
                    s.component = coalesce(e.component, s.component)
                    s.requester_review_num = -1
                    ei += 1

                    if s.bug_status == 'closed' and e.review_end_reason == 'closed':
                        # reviews on closed bugs are ignored
                        continue
                    reviews.append(s)

            # MARK THE FIRST REVIEW REQUEST PER BUG (window mutates in place)
            qb.run({
                "from": reviews,
                "window": [{
                    "name": "is_first",
                    "value": "rownum == 0",
                    "edges": ["bug_id"],
                    "sort": ["request_time"],
                    "aggregate": "none"
                }]
            })

        with Timer("add {{num}} reviews to ES for block {{start}}", {"start": min(*bugs), "num": len(reviews)}):
            sink.extend({"json": convert.value2json(r)} for r in reviews)
def replicate(source, destination, pending, last_updated):
    """
    COPY THE DEPENDENCY RECORDS TO THE destination
    NOTE THAT THE PUBLIC CLUSTER HAS HOLES, SO WE USE blocked TO FILL THEM

    source       - ES index to read bug versions from
    destination  - sink with .extend() accepting {"id", "value"} records
    pending      - bug_ids needing replication (grouped into batches)
    last_updated - only versions still live at/after this time are copied
    """
    for g, bugs in qb.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        # ONLY VERSIONS NOT YET EXPIRED AT last_updated
                        {"range": {"expires_on": {"gte": convert.datetime2milli(last_updated)}}},
                        {"or": [
                            {"exists": {"field": "dependson"}},
                            {"exists": {"field": "blocked"}}
                        ]}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id", "modified_ts", "expires_on", "dependson", "blocked"]
            })

        # FORWARD DIRECTION: bug -> dependson
        # id drops the last 3 digits of modified_ts (second resolution)
        with Timer("Push to destination"):
            d2 = [
                {
                    "id": str(x.bug_id) + "_" + str(x.modified_ts)[:-3],
                    "value": {
                        "bug_id": x.bug_id,
                        "modified_ts": x.modified_ts,
                        "expires_on": x.expires_on,
                        "dependson": x.dependson
                    }
                }
                for x in data.hits.hits.fields
                if x.dependson
            ]
            destination.extend(d2)

        # REVERSE DIRECTION: derive dependson RECORDS FROM blocked
        with Timer("filter"):
            d4 = qb.run({
                "from": data.hits.hits.fields,
                "where": {"exists": {"field": "blocked"}}
            })

        with Timer("select"):
            d3 = qb.run({
                "from": d4,
                "select": [
                    {"name": "bug_id", "value": "blocked."},  # SINCE blocked IS A LIST, ARE SELECTING THE LIST VALUES, AND EFFECTIVELY PERFORMING A JOIN
                    "modified_ts",
                    "expires_on",
                    {"name": "dependson", "value": "bug_id"}
                ]
            })

        with Timer("Push to destination"):
            destination.extend([
                {
                    "id": str(x.bug_id) + "_" + str(x.dependson) + "_" + str(x.modified_ts)[:-3],
                    "value": x
                }
                for x in d3
                if x.dependson
            ])