def normalize(bug):
    """
    Convert a raw bug record into a stable 'clean' form suitable for diffing
    between ETL runs: assign a composite id, sort nested structures, coerce
    numeric fields, and convert date fields to milliseconds-since-epoch.

    Returns the scrubbed copy of the bug.
    """
    # Composite id: "<bug_id>_<modified_ts>" with the last 3 digits of the
    # timestamp dropped (milliseconds -> seconds resolution).
    bug.id = str(bug.bug_id) + "_" + str(bug.modified_ts)[:-3]
    bug._id = None

    # ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as
    # similar as possible.
    bug.flags = Q.sort(bug.flags, "value")
    if bug.attachments is not None:
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            a.flags = Q.sort(a.flags, "value")
    bug.changes = Q.sort(bug.changes, "field_name")

    # bug IS CONVERTED TO A 'CLEAN' COPY
    bug = scrub(bug)

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v is None:
            continue
        if f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif v == 0:
            # Zero means "unset" for scalar numeric fields; drop it.
            del bug[f]

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v is None:
            continue
        try:
            if isinstance(v, datetime):
                bug[dateField] = CNV.datetime2milli(v)
            elif DATE_PATTERN_STRICT.match(v):
                # Pad to microsecond precision, then parse.
                # Example: bug 856732 (cf_last_resolved)
                # FIX: format was "%H:%M%:S%f" — "%:S" is not a valid strptime
                # directive, so this branch could never parse successfully.
                # NOTE(review): if DATE_PATTERN_STRICT values carry a "."
                # before the fraction, the format may also need ".%f" —
                # confirm against sample data.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Normalize "-" separators to "/" and parse to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"),
                                        "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Keep only the leading date part, e.g. "2012-01-01".
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception as e:
            D.error("problem with converting date to milli (value={{value}})",
                    {"value": bug[dateField]}, e)

    # FIX: scrub() returns a new copy; without returning it the caller in
    # main() would receive None (it maps over normalize()'s return value).
    return bug
def normalize(bug):
    """
    Produce a 'clean', diff-stable copy of a bug record: composite id,
    sorted nested structures, normalized numeric fields, and date fields
    converted to milliseconds-since-epoch. Returns the scrubbed copy.
    """
    # "<bug_id>_<modified_ts>" with milliseconds truncated to seconds.
    bug.id = str(bug.bug_id) + "_" + str(bug.modified_ts)[:-3]
    bug._id = None

    # ENSURE STRUCTURES ARE SORTED so diffing between runs stays stable.
    bug.flags = Q.sort(bug.flags, "value")
    if bug.attachments is not None:
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            a.flags = Q.sort(a.flags, "value")
    bug.changes = Q.sort(bug.changes, "field_name")

    # bug IS CONVERTED TO A 'CLEAN' COPY
    bug = scrub(bug)

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v is None:
            continue
        if f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif v == 0:
            # Treat zero as "unset" and remove the field.
            del bug[f]

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v is None:
            continue
        try:
            if isinstance(v, datetime):
                bug[dateField] = CNV.datetime2milli(v)
            elif DATE_PATTERN_STRICT.match(v):
                # Example: bug 856732 (cf_last_resolved)
                # FIX: "%H:%M%:S%f" contained the invalid directive "%:S";
                # corrected to "%H:%M:%S%f" so this branch can parse.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"),
                                        "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception as e:
            D.error("problem with converting date to milli (value={{value}})",
                    {"value": bug[dateField]}, e)

    # FIX: return the scrubbed copy; main() consumes normalize()'s return
    # value, which was previously None.
    return bug
def get_pending(es, since):
    # Collect the bug_ids modified at or after `since`, together with how
    # many change records each has, via a terms facet (hits themselves are
    # not needed, hence size:0).
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    }
    result = es.search(query)

    terms = result.facets.default.terms
    # The facet is capped at 200000 terms; hitting the cap means results
    # would be silently truncated.
    if len(terms) >= 200000:
        D.error("Can not handle more than 200K bugs changed")

    return multiset(terms, key_field="term", count_field="count")
def get_last_updated(es):
    """
    Return the datetime of the newest modified_ts in the index, or
    datetime.min when no document is newer than `far_back` (module global).

    On search failure, reports via D.error (which is presumably
    raising/logging — verify its semantics before relying on a return here).
    """
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            # Statistical facet gives max(modified_ts) without fetching hits.
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        })

        if results.facets["0"].count == 0:
            return datetime.min  # FIX: dropped stray trailing semicolon
        return CNV.milli2datetime(results.facets["0"].max)
    except Exception as e:
        D.error("Can not get_last_updated from {{host}}/{{index}}",
                {"host": es.settings.host, "index": es.settings.index}, e)
def main(settings):
    """
    ETL entry point. Two modes:
      * file mode  — settings.source.filename set: build a fresh, timestamped
        index from a file, then swap the alias onto it;
      * sync mode  — incrementally copy recently-modified bugs from a source
        ES index into the destination index.
    """
    # USE A FILE
    if settings.source.filename is not None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + CNV.datetime2string(
            datetime.utcnow(), "%Y%m%d_%H%M%S")
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)  # suspend refresh during bulk load
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)
        # Point the alias at the new index and drop the older generations.
        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # Re-scan one extra week so late-arriving changes are not missed.
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)  # IN {"bug_id":b, "count":c} FORM

    # MAIN ETL LOOP
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": bugs}},
                    {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        # FIX: `lambda(x): ...` (parenthesized/tuple parameter) is
        # Python-2-only syntax; an equivalent comprehension is portable
        # and clearer than nested map()/lambda.
        cleaned = [
            transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x))
            for x in data.hits.hits
        ]
        d2 = [{"id": b.id, "value": b} for b in cleaned]
        destination.add(d2)
def get_pending(es, since):
    # Ask ES for every bug_id whose modified_ts >= since. Only the facet is
    # wanted (size:0 suppresses the hit list); the terms facet returns each
    # bug_id with the number of matching change records.
    cutoff_milli = CNV.datetime2milli(since)
    response = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"range": {"modified_ts": {"gte": cutoff_milli}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    })

    # Guard against silent truncation at the facet's 200K cap.
    if len(response.facets.default.terms) >= 200000:
        D.error("Can not handle more than 200K bugs changed")

    pending_bugs = multiset(
        response.facets.default.terms,
        key_field="term",
        count_field="count"
    )
    return pending_bugs
def get_last_updated(es):
    """
    Find the most recent modified_ts in the index (as a datetime).

    Uses a statistical facet over modified_ts, restricted to documents newer
    than the module-level `far_back` cutoff. Returns datetime.min when that
    window contains no documents. On failure, delegates to D.error with the
    host/index for context.
    """
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        })

        if results.facets["0"].count == 0:
            return datetime.min
        return CNV.milli2datetime(results.facets["0"].max)
    except Exception as e:
        # FIX: `except Exception, e` is Python-2-only syntax; the `as` form
        # is equivalent and also valid on Python 2.6+.
        D.error("Can not get_last_updated from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)
def main(settings):
    """
    ETL driver.

    File mode (settings.source.filename set): create a new timestamped index
    from the file, move the alias to it, and delete older generations.
    Sync mode: pull bugs modified since the destination's last update
    (minus a 7-day overlap) from the source index, normalize them, and bulk
    add them to the destination.
    """
    # USE A FILE
    if settings.source.filename is not None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + CNV.datetime2string(
            datetime.utcnow(), "%Y%m%d_%H%M%S")
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)  # disable refresh while bulk loading
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)
        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # Back off a week to re-capture any changes that arrived late.
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)  # pending IS IN {"bug_id":b, "count":c} FORM

    # MAIN ETL LOOP
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": bugs}},
                    {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        # FIX: replaced the Python-2-only `lambda (x): ...` tuple-parameter
        # lambdas (a syntax error on Python 3) with comprehensions that do
        # exactly the same transformation.
        d2 = [
            {"id": b.id, "value": b}
            for b in (
                transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x))
                for x in data.hits.hits
            )
        ]
        destination.add(d2)