def get_recent_private_comments(db, param):
    """
    GET COMMENTS THAT HAVE HAD THEIR PRIVACY INDICATOR CHANGED
    """
    if param.allow_private_bugs:
        return []

    param.field_id = PRIVATE_COMMENTS_FIELD_ID
    try:
        comments = db.query("""
            SELECT
                a.comment_id,
                a.bug_id
            FROM
                bugs_activity a
            WHERE
                bug_when >= {{start_time_str}} AND
                fieldid={{field_id}}
            """, param)

        return comments
    except Exception, e:
        Log.error("problem getting recent private comments", e)

def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings)
    except Exception, e:
        Log.error("Problems exist", e)

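# PRESUMABLY THE MODULE ENTRY POINT; THE GUARD ITSELF IS AN ASSUMPTION AND
# NOT SHOWN IN THE SOURCE
if __name__ == "__main__":
    start()
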
def test_private_etl(self):
    """
    ENSURE IDENTIFIABLE INFORMATION DOES NOT EXIST ON ANY BUGS
    """
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()
    self.settings.param.allow_private_bugs = True

    database.make_test_instance(self.settings.bugzilla)
    es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
    bz_etl.main(self.settings, es, es_comments)

    ref = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)
    compare_both(es, ref, self.settings, self.settings.param.bugs)

    #DIRECT COMPARE THE FILE JSON
    can = File(self.settings.fake.comments.filename).read()
    ref = File(self.settings.private_comments_reference.filename).read()

    if can != ref:
        found = -1  # INITIALIZE BEFORE THE LOOP SO THE FIRST MISMATCH IS KEPT
        for i, c in enumerate(can):
            if can[i] != ref[i]:
                found = i
                break
        Log.error("Comments do not match reference\n{{sample}}", {
            "sample": can[max(0, found - 100):found + 100]  # SHOW CONTEXT AROUND FIRST MISMATCH
        })

def get_max_bug_id(es):
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"script": {"script": "true"}}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "bug_id"}}}
        })

        if results.facets["0"].count == 0:
            return 0
        return results.facets["0"].max
    except Exception, e:
        Log.error("Can not get_max_bug from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)

def test_public_etl(self):
    """
    ENSURE ETL GENERATES WHAT'S IN THE REFERENCE FILE
    """
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()
    self.settings.param.allow_private_bugs = Null

    database.make_test_instance(self.settings.bugzilla)
    es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
    bz_etl.main(self.settings, es, es_comments)

    ref = elasticsearch.open_test_instance("reference", self.settings.public_bugs_reference)
    compare_both(es, ref, self.settings, self.settings.param.bugs)

    #DIRECT COMPARE THE FILE JSON
    can = File(self.settings.fake.comments.filename).read()
    ref = File(self.settings.public_comments_reference.filename).read()

    if can != ref:
        found = -1
        for i, c in enumerate(can):
            if can[i] != ref[i]:
                found = i
                break
        Log.error("Comments do not match reference\n{{sample}}", {
            "sample": can[max(0, found - 100):found + 100]  # SHOW CONTEXT AROUND FIRST MISMATCH
        })

def verify_public_bugs(es, private_bugs):
    #VERIFY BUGS ARE IN OUTPUT
    for b in private_bugs:
        versions = compare_es.get_all_bug_versions(es, b)
        if not versions:
            Log.error("Expecting versions for public bug {{bug_id}}", {"bug_id": b})

def test_whiteboard_screened(self):
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        #MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        db.flush()

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")

def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        rows = db.query("SELECT id FROM fielddefs WHERE name={{field_name}}", {"field_name": prefix + c})
        if not rows or rows[0].id == None:  # GUARD AGAINST AN EMPTY RESULT BEFORE INDEXING
            Log.error("Expecting a valid field name")
        fieldid = rows[0].id

        activity = Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[c],
            added=new_record[c],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        )
        db.insert("bugs_activity", activity)

    db.execute("UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
        "now": now,
        "where": esfilter2sqlwhere(db, {"term": {"bug_id": old_record.bug_id}})
    })

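def example_diff_usage(db):
    # A HYPOTHETICAL SKETCH (NOT FROM THE SOURCE): GIVEN TWO VERSIONS OF THE
    # SAME bugs ROW, diff() INSERTS ONE bugs_activity ROW PER CHANGED FIELD,
    # THEN ADVANCES bugs.delta_ts; THE RECORDS HERE ARE MADE UP FOR ILLUSTRATION
    old = Struct(bug_id=1046, status_whiteboard=u"", attach_id=None, comment_id=None)
    new = Struct(bug_id=1046, status_whiteboard=u"[screened]", attach_id=None, comment_id=None)
    diff(db, u"bugs", old, new)  # RECORDS ONLY THE status_whiteboard CHANGE
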
def get_comments_by_id(db, comments, param):
    """
    GET SPECIFIC COMMENTS
    """
    if param.allow_private_bugs:
        return []

    param.comments_filter = esfilter2sqlwhere(db, {"and": [
        {"term": {"isprivate": 0}},
        {"terms": {"c.comment_id": comments}}
    ]})

    try:
        comments = db.query("""
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                c.thetext comment,
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{comments_filter}}
            """, param)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)

def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)

def make_test_instance(db_settings):
    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            #CLEAR SCHEMA
            Log.note("Make empty {{schema}} schema", {"schema": db_settings.schema})
            no_schema = db_settings.copy()
            no_schema.schema = None
            with DB(no_schema) as db:
                db.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema": db.quote_column(db_settings.schema)})
                db.execute("CREATE DATABASE {{schema}}", {"schema": db.quote_column(db_settings.schema)})

            #FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema": db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            #ADD MISSING TABLES
            with DB(db_settings) as db:
                db.execute("""
                CREATE TABLE `longdescs_tags` (
                    `id` mediumint(9) NOT NULL AUTO_INCREMENT,
                    `comment_id` int(11) DEFAULT NULL,
                    `tag` varchar(24) NOT NULL,
                    PRIMARY KEY (`id`),
                    UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`),
                    CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id` FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`) ON DELETE CASCADE ON UPDATE CASCADE
                ) DEFAULT CHARSET=utf8""")
        except Exception, e:
            Log.error("Can not setup test database", e)

def test_ambiguous_whiteboard_screened(self):
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        #MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        #MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
        db.flush()

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")

def test_incremental_etl_catches_tracking_flags(self):
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([813650])
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, 813650)

        flags = [
            "cf_status_firefox18",
            "cf_status_firefox19",
            "cf_status_firefox_esr17",
            "cf_status_b2g18"
        ]
        for v in versions:
            if v.modified_ts > param.start_time:
                for f in flags:
                    if v[f] != "fixed":
                        Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})

def get_pending(es, since):
    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    })

    if len(result.facets.default.terms) >= 200000:
        Log.error("Can not handle more than 200K bugs changed")

    pending_bugs = Multiset(
        result.facets.default.terms,
        key_field="term",
        count_field="count"
    )
    Log.note("Source has {{num}} bug versions for updating", {"num": len(pending_bugs)})
    return pending_bugs

def verify_no_private_bugs(es, private_bugs):
    #VERIFY BUGS ARE NOT IN OUTPUT
    for b in private_bugs:
        versions = compare_es.get_all_bug_versions(es, b)
        if versions:
            Log.error("Expecting no version for private bug {{bug_id}}", {"bug_id": b})

def tearDown(self):
    #CLOSE THE CACHED DB CONNECTIONS
    bz_etl.close_db_connections()

    if all_db:  # all_db TRACKS CONNECTIONS THAT ARE STILL OPEN
        Log.error("not all db connections are closed")

    Log.stop()

def verify_no_private_attachments(es, private_attachments):
    #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
    private_attach_ids = set(Q.select(private_attachments, "attach_id"))
    for b in Q.select(private_attachments, "bug_id"):
        versions = compare_es.get_all_bug_versions(es, b)
        #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
        #BELONGS TO, IF AT ALL
        for v in versions:
            for a in v.attachments:
                if a.attach_id in private_attach_ids:
                    Log.error("Private attachment should not exist")

def get_private_bugs_for_delete(db, param):
    if param.allow_private_bugs:
        return {0}  # NO BUGS TO DELETE; bug_id 0 NEVER EXISTS, IT ONLY KEEPS THE SET NON-EMPTY

    try:
        with Timer("get all private bug ids"):
            private_bugs = db.query("SELECT DISTINCT bug_id FROM bug_group_map")
            return set(private_bugs.bug_id) | {0}
    except Exception, e:
        Log.error("problem getting private bugs", e)

def incremental_etl(settings, param, db, es, es_comments, output_queue):
    ####################################################################
    ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE
    ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS
    ####################################################################

    #REMOVE PRIVATE BUGS
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
    for g, delete_bugs in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
        if still_existing:
            Log.note("The following private bugs are still in the index; deleting them again:\n{{private_bugs|indent}}", {"private_bugs": still_existing})
        es.delete_record({"terms": {"bug_id": delete_bugs}})
        es_comments.delete_record({"terms": {"bug_id": delete_bugs}})

    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record({"terms": {"bug_id": possible_public_bugs}})
    else:
        #PUBLIC BUGS
        #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
        #    IF REMOVING GROUP THEN NO RECORDS TO DELETE
        pass

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record({"terms": {"bug_id": bugs_to_refresh}})

    #REBUILD BUGS THAT GOT REMOVED
    bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs  # REMOVE PRIVATE BUGS
    if bug_list:
        refresh_param = param.copy()
        refresh_param.bug_list = bug_list
        refresh_param.start_time = 0
        refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

        try:
            etl(db, output_queue, refresh_param.copy(), please_stop=None)
            etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
        except Exception, e:
            Log.error("Problem with etl using parameters {{parameters}}", {"parameters": refresh_param}, e)

def run_both_etl(db, output_queue, es_comments, param):
    comment_thread = Thread.run("etl comments", etl_comments, db, es_comments, param)
    process_thread = Thread.run("etl", etl, db, output_queue, param)

    result = comment_thread.join()
    if result.exception:
        Log.error("etl_comments had problems", result.exception)

    result = process_thread.join()
    if result.exception:
        Log.error("etl had problems", result.exception)

def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs.delta_ts)
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK FIVE MINUTES, JUST IN CASE
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)

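# A SKETCH OF THE SETTINGS SHAPE main() EXPECTS, INFERRED ONLY FROM THE FIELDS
# REFERENCED ABOVE; FILE NAMES AND VALUES ARE HYPOTHETICAL:
#
#     {
#         "param": {
#             "first_run_time": "first_run_time.txt",
#             "last_run_time": "last_run_time.txt",
#             "look_back": 300000,              # MILLISECONDS; DEFAULTS TO 5 MINUTES
#             "alias_file": "aliases.json",
#             "allow_private_bugs": false
#         },
#         "bugzilla": {... MySQL CONNECTION SETTINGS ...},
#         "es": {"host": ..., "index": ..., "alias": ...},
#         "es_comments": {"host": ..., "index": ..., "alias": ...},
#         "debug": {... Log SETTINGS ...}
#     }
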
def test_confidential_whiteboard_is_screened(self):
    leaked_whiteboard = get(
        self.private,
        {"and": [
            {"terms": {"bug_group": SCREENED_WHITEBOARD_BUG_GROUPS}},
            {"exists": {"field": "status_whiteboard"}},
            {"not": {"terms": {"status_whiteboard": ["", "[screened]"]}}},
            {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
            {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  #OF A MINIMUM AGE
        ]},
        fields=["bug_id", "product", "component", "status_whiteboard", "bug_group", "modified_ts"],
        limit=100
    )

    if leaked_whiteboard:
        for l in leaked_whiteboard:
            l.modified_ts = CNV.datetime2string(CNV.milli2datetime(l.modified_ts))
        Log.error("Whiteboard leaking:\n{{leak}}", {"leak": leaked_whiteboard})

def get_comments(db, param):
    if not param.bug_list:
        return []

    if param.allow_private_bugs:
        param.comment_field = SQL("'[screened]' comment")
        param.bug_filter = esfilter2sqlwhere(db, {"and": [
            {"terms": {"bug_id": param.bug_list}}
        ]})
    else:
        param.comment_field = SQL("c.thetext comment")
        param.bug_filter = esfilter2sqlwhere(db, {"and": [
            {"terms": {"bug_id": param.bug_list}},
            {"term": {"isprivate": 0}}
        ]})

    try:
        comments = db.query("""
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                {{comment_field}},
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{bug_filter}} AND
                bug_when >= {{start_time_str}}
            """, param)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)

def test_incremental_has_correct_expires_on(self):
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        #SETUP FIRST RUN PARAMETERS
        param = Struct()
        param.end_time = start_incremental
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        #SETUP INCREMENTAL RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(datetime.utcnow())
        param.start_time = start_incremental
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

    for b in bugs:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {"bug_id": b}},
                    {"range": {"expires_on": {"gte": CNV.datetime2milli(datetime.utcnow())}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        if results.hits.total > 1:
            Log.error("Expecting only one active bug_version record")

def verify_no_private_comments(es, private_comments):
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"comment_id": private_comments}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    if Q.select(data.hits.hits, "_source"):
        Log.error("Expecting no comments")

def test_private_comments_not_leaking(self):
    leaked_comments = get(
        self.public_comments,
        {"term": {"isprivate": "1"}},
        limit=20
    )
    if leaked_comments:
        if self.settings.param.delete:
            self.public_comments.delete_record(
                {"terms": {"bug_id": leaked_comments.bug_id}}
            )

        Log.error("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
            "num": len(leaked_comments),
            "comments": leaked_comments
        })

def get_or_create_index(destination_settings, source):
    #CHECK IF INDEX, OR ALIAS, EXISTS
    es = ElasticSearch(destination_settings)
    aliases = es.get_aliases()

    indexes = [a for a in aliases if a.alias == destination_settings.index]
    if not indexes:
        #CREATE INDEX
        schema = source.get_schema()
        assert schema.settings
        assert schema.mappings
        ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
    elif len(indexes) > 1:
        Log.error("do not know how to replicate to more than one index")
    elif indexes[0].alias != None:
        destination_settings.alias = destination_settings.index
        destination_settings.index = indexes[0].index

    return ElasticSearch(destination_settings)

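# A HYPOTHETICAL USAGE SKETCH (NOT FROM THE SOURCE): get_or_create_index() IS
# SAFE TO CALL UNCONDITIONALLY AT THE START OF A REPLICATION RUN; WHEN THE
# CONFIGURED NAME TURNS OUT TO BE AN ALIAS, destination_settings IS REWRITTEN
# IN PLACE TO POINT AT THE CONCRETE INDEX BEHIND IT:
#
#     destination = get_or_create_index(settings.destination, source_es)
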
def get_bug_ids(es, filter):
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": filter
            }},
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        return set(results.hits.hits.fields.bug_id)
    except Exception, e:
        Log.error("Can not get bug ids from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)

def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

    if found_errors:
        Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {"path": [try_dir, ref_dir]})

def get_recent_private_bugs(db, param):
    """
    GET ONLY BUGS THAT HAVE SWITCHED PRIVACY INDICATOR
    THIS LIST IS USED TO SIGNAL BUGS THAT NEED TOTAL RE-ETL
    """
    param.field_id = PRIVATE_BUG_GROUP_FIELD_ID

    try:
        output = db.query("""
            SELECT
                a.bug_id
            FROM
                bugs_activity a
            WHERE
                bug_when >= {{start_time_str}} AND
                fieldid={{field_id}}
            """, param)

        return set(output.bug_id)
    except Exception, e:
        Log.error("problem getting recent private bugs", e)

def get_last_updated(es):
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        })

        if results.facets["0"].count == 0:
            return datetime.min
        return CNV.milli2datetime(results.facets["0"].max)
    except Exception, e:
        Log.error("Can not get_last_updated from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)

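def example_incremental_resume(source, destination):
    # A HYPOTHETICAL SKETCH (NOT FROM THE SOURCE): HOW get_last_updated() AND
    # get_pending() MIGHT COMBINE WHEN RESUMING REPLICATION; ASK THE DESTINATION
    # HOW FAR IT GOT, THEN ASK THE SOURCE WHAT CHANGED SINCE THEN
    since = get_last_updated(destination)  # datetime.min WHEN DESTINATION IS EMPTY
    pending_bugs = get_pending(source, since)  # Multiset OF bug_id -> PENDING VERSION COUNT
    return pending_bugs
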
def get_recent_private_attachments(db, param):
    """
    GET ONLY RECENT ATTACHMENTS THAT HAVE SWITCHED PRIVACY INDICATOR
    THIS LIST IS USED TO SIGNAL BUGS THAT NEED TOTAL RE-ETL
    """
    if param.allow_private_bugs:
        return []

    param.field_id = PRIVATE_ATTACHMENT_FIELD_ID
    try:
        return db.query("""
            SELECT
                a.attach_id,
                a.bug_id
            FROM
                bugs_activity a
            WHERE
                bug_when >= {{start_time_str}} AND
                fieldid={{field_id}}
            """, param)
    except Exception, e:
        Log.error("problem getting recent private attachments", e)