def get_private_bugs(es):
    """
    FIND THE BUGS WE DO NOT EXPECT TO BE FOUND IN PUBLIC
    """
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"script": {"script": "true"}},
                {"and": [{"exists": {"field": "bug_group"}}]}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": [],
        "facets": {},
        "fields": ["bug_id", "blocked", "dependson", "dupe_of", "dupe_by"]
    })

    with Timer("aggregate es results on private bugs"):
        output = set([])
        for bug in data.hits.hits:
            output.add(bug.fields.bug_id)
            output |= set(nvl(CNV.value2intlist(bug.fields.blocked), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dependson), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dupe_of), []))
            output |= set(nvl(CNV.value2intlist(bug.fields.dupe_by), []))

    output.update([551988, 636964])  # set.add() TAKES ONE ELEMENT; update() ADDS BOTH IDS
    return output
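def _demo_set_update():
    # A minimal stdlib sketch (hypothetical demo helper, not part of the ETL)
    # of why the fix above uses update(): set.add() accepts exactly one
    # element, so the original output.add(551988, 636964) raised a TypeError.
    output = set([123456])
    output.update([551988, 636964])  # ADDS BOTH IDS IN ONE CALL
    assert 551988 in output and 636964 in output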
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)}):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            Log.warning("No alias file found (looking at {{filename}})", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)})
            alias_json = "{}"

        #self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception, e:
        # OUTER try HAD NO HANDLER IN THE ORIGINAL; ASSUMED INTENT IS TO REPORT AND STOP
        Log.error("Can not load aliases", e)
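def _demo_alias_json():
    # A minimal stdlib sketch (hypothetical addresses) of the alias file format
    # assumed by loadAliases() above: a JSON object mapping each known email to
    # a record whose "canonical" field names the address it is folded into.
    import json

    alias_json = '{"old@example.com": {"canonical": "new@example.com"}}'
    aliases = json.loads(alias_json)
    assert aliases["old@example.com"]["canonical"] == "new@example.com"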
def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs.delta_ts)
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK look_back MILLISECONDS (5 MINUTES BY DEFAULT), JUST IN CASE
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
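def _demo_look_back():
    # A minimal stdlib sketch (hypothetical timestamps) of the look-back window
    # computed above: times are epoch milliseconds, and each run re-scans a few
    # minutes of history to absorb delayed, out-of-order DB writes.
    from datetime import datetime

    def datetime2milli(d):  # SAME IDEA AS CNV.datetime2milli, ASSUMING UTC INPUT
        diff = d - datetime(1970, 1, 1)
        return (diff.days * 86400 + diff.seconds) * 1000 + diff.microseconds // 1000

    last_run_time = datetime2milli(datetime(2013, 6, 1, 12, 0, 0))
    look_back = 5 * 60 * 1000  # THE 5 MINUTE DEFAULT USED ABOVE
    start_time = last_run_time - look_back
    assert last_run_time - start_time == 300000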
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) ARE EMAIL ADDRESSES THAT BELONG TO PEOPLE.
    SINCE THE EMAIL ADDRESS FOR A PERSON CAN CHANGE OVER TIME, THIS CODE
    WILL ASSOCIATE EACH PERSON WITH THE EMAIL ADDRESSES USED OVER THE
    LIFETIME OF THE BUGZILLA DATA.  'PERSON' IS ABSTRACT, AND SIMPLY
    ASSIGNED A CANONICAL EMAIL ADDRESS TO FACILITATE IDENTIFICATION
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    if bug_list:
        with DB(settings.bugzilla, readonly=True) as db:
            data = get_all_cc_changes(db, bug_list)
            aggregator(data)
            analysis(settings, True, please_stop)
        return

    with DB(settings.bugzilla, readonly=True) as db:
        start = nvl(settings.param.start, 0)
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for s, e in Q.intervals(start, end, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {"start": s, "end": e})
            data = get_all_cc_changes(db, range(s, e))
            if please_stop:
                break
            aggregator(data)

            analysis(settings, e >= end, please_stop)
def get_all_bug_versions(es, bug_id, max_time=None):
    max_time = nvl(max_time, datetime.max)

    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"term": {"bug_id": bug_id}},
                {"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")
def get(es, esfilter, fields=None, limit=None):
    query = struct.wrap({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": esfilter
        }},
        "from": 0,
        "size": nvl(limit, 200000),
        "sort": []
    })

    if fields:
        query.fields = fields
        results = es.search(query)
        return Q.select(results.hits.hits, "fields")
    else:
        results = es.search(query)
        return Q.select(results.hits.hits, "_source")
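def _demo_get_query():
    # A hypothetical usage sketch (plain dicts, illustrative filter and field
    # names not taken from the source) of the query shape that get() above
    # wraps: a filtered match_all, with a field list attached only on request.
    esfilter = {"term": {"bug_id": 123456}}
    query = {
        "query": {"filtered": {"query": {"match_all": {}}, "filter": esfilter}},
        "from": 0,
        "size": 10,
        "sort": []
    }
    query["fields"] = ["bug_id", "blocked"]  # ONLY WHEN fields IS GIVEN
    assert query["query"]["filtered"]["filter"] is esfilter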
def add_alias(lost, found):
    found_record = aliases.get(found, None)
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    delete_list = []

    #FOLD bugs ON lost=found
    for bug_id, agg in bugs.iteritems():
        v = agg.dic.get(lost, 0)
        if v != 0:
            agg.add(lost, -v)
            agg.add(found, v)

        if not agg:
            delete_list.append(bug_id)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        for bug_id, agg in bugs.iteritems():
            v = agg.dic.get(old_canonical, 0)
            if v != 0:
                agg.add(old_canonical, -v)
                agg.add(new_canonical, v)

            if not agg:
                delete_list.append(bug_id)

    for d in delete_list:
        del bugs[d]

    #FOLD ALIASES
    for k, v in aliases.iteritems():
        if v.canonical == old_canonical:
            Log.note("ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                "alias": k,
                "old": old_canonical,
                "new": found
            })
            v.canonical = found
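def _demo_alias_fold():
    # A minimal stdlib sketch (hypothetical emails, plain dicts instead of the
    # agg structs) of the folding add_alias() above performs: move per-bug
    # counts from the lost address to the found one, then repoint every alias
    # of the old canonical at the new canonical.
    aliases = {
        "a@x.com": {"canonical": "a@x.com"},
        "b@y.com": {"canonical": "a@x.com"},
    }
    bugs = {1: {"a@x.com": 1, "c@z.com": -1}}

    lost, found = "a@x.com", "c@z.com"
    for agg in bugs.values():
        v = agg.pop(lost, 0)
        if v:
            agg[found] = agg.get(found, 0) + v  # FOLD lost INTO found
    for record in aliases.values():
        if record["canonical"] == lost:
            record["canonical"] = found  # REMAP TO NEW CANONICAL

    assert bugs[1] == {"c@z.com": 0}
    assert aliases["b@y.com"]["canonical"] == "c@z.com"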
def main(settings):
    #USE A FILE
    if settings.source.filename is not None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))

        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
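def _demo_last_updated():
    # A minimal stdlib sketch (hypothetical values) of the resume logic above:
    # take the EARLIER of the file-recorded and index-derived timestamps so
    # replication never skips records when the two sources disagree.
    from_file = 1370088000000  # EPOCH MILLIS READ FROM last_replication_time
    from_es = 1370091600000    # EPOCH MILLIS OF NEWEST DOC IN destination
    candidates = [t for t in (from_file, from_es) if t is not None]
    last_updated = min(candidates) if candidates else 0  # SAME IDEA AS nvl(MIN(...), 0)
    assert last_updated == from_file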
def alias(email):
    output = nvl(aliases.get(email, Null).canonical, email)
    return output
def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
    with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
        start = nvl(settings.param.start, 0)
        if resume_from_last_run:
            start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))

        #############################################################
        ## MAIN ETL LOOP
        #############################################################

        #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
        # with Multithread([run_both_etl, run_both_etl]) as workers:
        for min, max in Q.intervals(start, end, settings.param.increment):
            if settings.args.quick and min < end - settings.param.increment and min != 0:
                #--quick ONLY DOES FIRST AND LAST BLOCKS
                continue

            try:
                #GET LIST OF CHANGED BUGS
                with Timer("time to get {{min}}..{{max}} bug list", {"min": min, "max": max}):
                    if param.allow_private_bugs:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}})
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")
                    else:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            LEFT JOIN
                                bug_group_map m ON m.bug_id=b.bug_id
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND
                                m.bug_id IS NULL
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")

                if not bug_list:
                    continue

                param.bug_list = bug_list
                run_both_etl(**{
                    "db": db,
                    "output_queue": output_queue,
                    "es_comments": es_comments,
                    "param": param.copy()
                })
            except Exception, e:
                Log.error("Problem with dispatch loop in range [{{min}}, {{max}})", {
                    "min": min,
                    "max": max
                }, e)
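def _demo_intervals():
    # A minimal stdlib sketch of the chunking behavior assumed of Q.intervals
    # above: half-open [min, max) ranges of a fixed increment covering the
    # whole [start, end) span, so a crash loses at most one block of work.
    def intervals(start, end, size):
        return [(s, min(s + size, end)) for s in range(start, end, size)]

    assert intervals(0, 25, 10) == [(0, 10), (10, 20), (20, 25)]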