def make_test_instance(db_settings):
    """
    REBUILD THE TEST DATABASE DESCRIBED BY db_settings:
    DROP AND RE-CREATE db_settings.schema, RUN THE SQL FILE db_settings.filename
    AGAINST IT, THEN ADD THE longdescs_tags TABLE.

    WHEN db_settings.filename IS NOT SET NOTHING IS TOUCHED (ONLY A NOTE IS LOGGED).
    ANY EXCEPTION IS HANDED TO Log.error("Can not setup test database", ...)
    """
    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            # CLEAR SCHEMA
            # CONNECT WITH schema=None BECAUSE THE SCHEMA ITSELF IS DROPPED AND RE-CREATED
            Log.note("Make empty {{schema}} schema", {"schema": db_settings.schema})
            schemaless = db_settings.copy()
            schemaless.schema = None
            with DB(schemaless) as db:
                for statement in (
                    "DROP DATABASE IF EXISTS {{schema}}",
                    "CREATE DATABASE {{schema}}",
                ):
                    db.execute(statement, {"schema": db.quote_column(db_settings.schema)})

            # FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema": db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            # ADD MISSING TABLES
            with DB(db_settings) as db:
                db.execute("""
                    CREATE TABLE `longdescs_tags` (
                        `id` mediumint(9) NOT NULL AUTO_INCREMENT,
                        `comment_id` int(11) DEFAULT NULL,
                        `tag` varchar(24) NOT NULL,
                        PRIMARY KEY (`id`),
                        UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`),
                        CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id`
                            FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`)
                            ON DELETE CASCADE ON UPDATE CASCADE
                    ) DEFAULT CHARSET=utf8""")
        except Exception as cause:
            Log.error("Can not setup test database", cause)
def test_private_comments_do_not_show(self):
    """
    MARK FIVE COMMENTS PRIVATE, RUN THE FULL ETL WITH allow_private_bugs=False,
    THEN VERIFY NONE OF THOSE COMMENTS CAN BE FOUND
    """
    self.settings.param.allow_private_bugs = False
    database.make_test_instance(self.settings.bugzilla)

    # MARK SOME COMMENTS PRIVATE
    with DB(self.settings.bugzilla) as db:
        # THE mod() ORDERING GIVES A REPEATABLE PICK OF FIVE COMMENTS
        private_comments = db.query("""
            SELECT
                bug_id,
                comment_id
            FROM
                longdescs
            ORDER BY
                mod(comment_id, 7),
                comment_id
            LIMIT
                5
        """)
        for comment in private_comments:
            database.mark_comment_private(db, comment.comment_id, 1)

    bugs_es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    comments_es = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    bz_etl.main(self.settings, bugs_es, comments_es)

    Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
    # NOTE(review): THE CHECK RUNS AGAINST THE BUGS INDEX, NOT comments_es, AND PASSES
    # WHOLE ROWS RATHER THAN comment_id VALUES - CONFIRM THIS IS INTENDED
    verify_no_private_comments(bugs_es, private_comments)
def test_specific_bugs(self):
    """
    REBUILD THE TEST DATABASE, RUN THE ETL FOR THE BUGS IN settings.param.bugs
    INTO A "candidate" ES TEST INSTANCE, THEN COMPARE THE RESULTING VERSIONS
    WITH THE "reference" INSTANCE (settings.private_bugs_reference).

    (AUTHOR'S NOTE: USE Fake_ES() INSTANCES TO KEEP THIS TEST LOCAL; THE
    REFERENCE IS ALSO CHECKED INTO THE REPOSITORY)
    """
    settings = self.settings
    database.make_test_instance(settings.bugzilla)

    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.candidate)
        reference = elasticsearch.open_test_instance("reference", settings.private_bugs_reference)

        # SETUP RUN PARAMETERS: EVERYTHING FROM TIME ZERO UP TO THE DATABASE'S "NOW"
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = settings.param.alias_file
        param.bug_list = settings.param.bugs
        param.allow_private_bugs = settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)

        # COMPARE ALL BUGS
        Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
        compare_both(candidate, reference, settings, settings.param.bugs)
def test_whiteboard_screened(self):
    """
    PUT ONE BUG INTO THE FIRST OF THE SCREENED_WHITEBOARD_BUG_GROUPS, RUN THE
    ETL FOR IT, AND REQUIRE EVERY VERSION'S status_whiteboard TO BE EMPTY OR
    "[screened]"
    """
    GOOD_BUG_TO_TEST = 1046  # SEES LOTS OF WHITEBOARD, AND OTHER FIELD, CHANGES
    ACCEPTABLE_WHITEBOARDS = (None, "", "[screened]")

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        db.flush()

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
        for version in get_all_bug_versions(es, GOOD_BUG_TO_TEST):
            if version.status_whiteboard not in ACCEPTABLE_WHITEBOARDS:
                Log.error("Expecting whiteboard to be screened")
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO.

    REPEATEDLY PICKS UP TO NUM_TO_TEST RANDOM BUG IDS (SKIPPING KNOWN PRIVATE
    BUGS), RUNS THE ETL FOR THEM INTO THE "candidate" ES, AND COMPARES WITH THE
    REFERENCE ES.  STOPS AT THE FIRST SAMPLE FOR WHICH compare_both() REPORTS
    ERRORS; AN EXCEPTION DURING A ROUND IS LOGGED AS A WARNING AND THE LOOP
    CONTINUES WITH A NEW SAMPLE.
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            candidates = [Random.int(MAX_BUG_ID) for _ in range(NUM_TO_TEST)]
            some_bugs = [b for b in candidates if b not in private_bugs]

            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            # SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file
            # RESTRICT THE ETL TO THE SAMPLE, LIKE EVERY OTHER etl() CALLER DOES;
            # WITHOUT THIS THE SAMPLE WAS ONLY USED FOR THE COMPARE STEP
            param.bug_list = some_bugs

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                # COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
            except Exception as e:
                Log.warning(
                    "Total failure during compare of bugs {{bugs}}",
                    {"bugs": some_bugs},
                    e
                )
def main(settings, es=None, es_comments=None):
    """
    RUN ONE PASS OF THE ETL FROM THE BUGZILLA DATABASE TO ELASTICSEARCH.

    settings    - CONFIGURATION; READS settings.param.*, settings.bugzilla,
                  settings.es AND settings.es_comments
    es          - OPTIONAL HANDLE FOR BUG VERSIONS (PASSED THROUGH setup_es())
    es_comments - OPTIONAL HANDLE FOR COMMENTS; REQUIRED WHEN es IS GIVEN AND
                  settings.param.allow_private_bugs IS FALSE

    AN INCREMENTAL ETL IS RUN WHEN setup_es() REPORTS last_run_time > 0,
    OTHERWISE A FULL ETL.  THE last_run_time FILE IS WRITTEN ONLY AFTER
    EVERYTHING ELSE SUCCEEDED.  ANY EXCEPTION IS HANDED TO
    Log.error("Problem with main ETL loop", ...)
    """
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # first_run_time EXISTS, BUT last_run_time DOES NOT
    resume_from_last_run = (
        File(settings.param.first_run_time).exists
        and not File(settings.param.last_run_time).exists
    )

    # MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                # SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK settings.param.look_back
                # (DEFAULT 5 MINUTES), JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            # USE THE COMMENTS HANDLE FOR BOTH STEPS (WAS es.delete_all_but, WHICH
            # ADDRESSED THE BUGS HANDLE WITH THE COMMENTS' ALIAS AND INDEX NAMES)
            es_comments.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception as e:
        Log.error("Problem with main ETL loop", e)
def main():
    """
    MEANT TO BE RUN JUST ONCE IN DEVELOPMENT TO CONVERT A BIG PUBLIC DATABASE
    (8G+) INTO A TINY TESTING DB (FOR ADDING TO REPOSITORY).

    ASKS FOR AN EXPLICIT "YES" ON THE CONSOLE BEFORE RUNNING scrub_db.sql
    AGAINST settings.bugzilla; Log.stop() IS ALWAYS CALLED ON THE WAY OUT.
    """
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)

        prompt = (
            "We are going to totally wipe out the "
            + settings.bugzilla.schema.upper()
            + " schema at "
            + settings.bugzilla.host.upper()
            + "! Type \"YES\" to continue: "
        )
        answer = raw_input(prompt)
        if answer != "YES":
            Log.note("Aborted. No Changes made.")
            return

        Log.note("Scrubbing db of those pesky records.")
        Log.note("This is going to take hours ...")

        # THE SCRIPT IS GIVEN THE SCHEMA NAME AND THE LIST OF BUGS FROM settings.param.bugs
        script_params = {
            "schema": settings.bugzilla.schema,
            "bug_list": SQL(settings.param.bugs)
        }
        DB.execute_file(settings.bugzilla, "./tests/resources/sql/scrub_db.sql", script_params)

        Log.note("... Done!")
    finally:
        Log.stop()
def etl_comments(db, es, param, please_stop):
    """
    READ COMMENTS (AS SELECTED BY param) FROM THE DATABASE AND SEND THEM TO es
    IN BATCHES OF 500, EACH AS {"id": comment_id, "value": comment}.

    please_stop IS ACCEPTED BUT NOT USED HERE.
    """
    # CONNECTIONS ARE EXPENSIVE: OPEN ONE ON FIRST USE AND KEEP IT IN comment_db_cache
    with comment_db_cache_lock:
        if not comment_db_cache:
            connection = DB(db)
            comment_db_cache.append(connection)

    # THE SHARED CONNECTION IS ONLY USED WHILE HOLDING THE LOCK
    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for _, batch in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(batch)}):
            es.extend({"id": comment.comment_id, "value": comment} for comment in batch)
def make_test_instance(db_settings):
    """
    MAKE A FRESH TEST DATABASE: DROP db_settings.schema (IF IT EXISTS),
    CREATE IT AGAIN, FILL IT BY EXECUTING db_settings.filename, AND ADD THE
    longdescs_tags TABLE.

    WITHOUT db_settings.filename THE SCHEMA IS LEFT ALONE AND ONLY A NOTE IS
    LOGGED.  FAILURES ARE REPORTED VIA Log.error("Can not setup test database", ...)
    """
    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            # CLEAR SCHEMA (USING A CONNECTION THAT HAS NO SCHEMA SELECTED)
            Log.note("Make empty {{schema}} schema", {"schema": db_settings.schema})
            server_settings = db_settings.copy()
            server_settings.schema = None
            with DB(server_settings) as server:
                server.execute(
                    "DROP DATABASE IF EXISTS {{schema}}",
                    {"schema": server.quote_column(db_settings.schema)}
                )
                server.execute(
                    "CREATE DATABASE {{schema}}",
                    {"schema": server.quote_column(db_settings.schema)}
                )

            # FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema": db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            # ADD MISSING TABLES
            with DB(db_settings) as test_db:
                test_db.execute("""
                    CREATE TABLE `longdescs_tags` (
                        `id` mediumint(9) NOT NULL AUTO_INCREMENT,
                        `comment_id` int(11) DEFAULT NULL,
                        `tag` varchar(24) NOT NULL,
                        PRIMARY KEY (`id`),
                        UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`),
                        CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id`
                            FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`)
                            ON DELETE CASCADE ON UPDATE CASCADE
                    ) DEFAULT CHARSET=utf8""")
        except Exception as cause:
            Log.error("Can not setup test database", cause)
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) ARE EMAIL ADDRESSES THAT BELONG TO PEOPLE.
    SINCE THE EMAIL ADDRESS FOR A PERSON CAN CHANGE OVER TIME, THIS CODE WILL
    ASSOCIATE EACH PERSON WITH THE EMAIL ADDRESSES USED OVER THE LIFETIME OF
    THE BUGZILLA DATA.  'PERSON' IS ABSTRACT, AND SIMPLY ASSIGNED A CANONICAL
    EMAIL ADDRESS TO FACILITATE IDENTIFICATION.

    settings    - READS settings.args.quick, settings.bugzilla AND settings.param
    bug_list    - WHEN GIVEN, ONLY THESE BUGS ARE ANALYZED (IN ONE STEP)
    please_stop - CHECKED BETWEEN BLOCKS; WHEN TRUTHY THE LOOP IS ABANDONED
    restart     - WHEN FALSE, loadAliases(settings) IS CALLED FIRST
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    if bug_list:
        with DB(settings.bugzilla, readonly=True) as db:
            cc_changes = get_all_cc_changes(db, bug_list)
            aggregator(cc_changes)
            analysis(settings, True, please_stop)
        return

    with DB(settings.bugzilla, readonly=True) as db:
        start = nvl(settings.param.start, 0)
        end = nvl(
            settings.param.end,
            db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id
        )

        # PERFORM ANALYSIS ON BLOCKS OF BUGS, IN CASE WE CRASH PARTWAY THROUGH
        for block_start, block_end in Q.intervals(start, end, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {"start": block_start, "end": block_end})
            cc_changes = get_all_cc_changes(db, range(block_start, block_end))
            if please_stop:
                break
            aggregator(cc_changes)
            # SECOND ARGUMENT IS TRUE ONLY FOR THE BLOCK THAT REACHES end
            analysis(settings, block_end >= end, please_stop)
def etl(db, output_queue, param, please_stop):
    """
    PROCESS RANGE, AS SPECIFIED IN param, AND PUSH BUG VERSION RECORDS TO
    output_queue.

    EVERY FUNCTION IN get_stuff_from_bugzilla IS RUN ON ITS OWN CACHED
    CONNECTION; THEIR ROWS ARE COLLECTED, SORTED, AND FED ONE AT A TIME TO A
    BugHistoryParser, FOLLOWED BY A FINAL parse_bug_history.STOP_BUG ROW.
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE (ONE PER EXTRACT FUNCTION)
    with db_cache_lock:
        if not db_cache:
            with Timer("open connections to db"):
                for _ in get_stuff_from_bugzilla:
                    # EACH NEW CONNECTION IS MADE FROM THE PREVIOUS ONE
                    db = DB(db)
                    db_cache.append(db)

    db_results = Queue(max=2**30)

    def process(target, db, param, please_stop):
        # RUN ONE EXTRACT FUNCTION AND COLLECT ITS ROWS
        db_results.extend(target(db, param))

    with db_cache_lock:
        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
        with AllThread() as workers:
            for index, extractor in enumerate(get_stuff_from_bugzilla):
                # EACH WORKER GETS ITS OWN COPY OF param
                workers.add(process, extractor, db_cache[index], param.copy())
        db_results.add(Thread.STOP)

    sort_order = [
        "bug_id",
        "_merge_order",
        {"field": "modified_ts", "sort": -1},
        "modified_by"
    ]
    ordered_rows = Q.sort(db_results, sort_order)

    parser = BugHistoryParser(param, output_queue)
    for row in ordered_rows:
        parser.processRow(row)
    # THE STOP ROW IS SENT LAST, AFTER ALL REAL ROWS
    parser.processRow(struct.wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE WITH VERSIONS OF THE BUGS
    FROM settings.param.bugs.  NOTHING IS COMPARED AFTERWARDS; THE RUN ONLY
    HAS TO COMPLETE.

    (AUTHOR'S NOTE: USE Fake_ES() INSTANCES TO KEEP THIS TEST LOCAL)
    """
    settings = self.settings

    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.elasticsearch)

        # SETUP RUN PARAMETERS: FROM TIME ZERO UP TO THE DATABASE'S "NOW"
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = settings.param.alias_file
        param.bug_list = settings.param.bugs
        param.allow_private_bugs = settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)
def test_incremental_etl_catches_tracking_flags(self):
    """
    RUN THE ETL FOR BUG 813650 STARTING AT 2013-01-02 10:09:15 AND REQUIRE
    EVERY VERSION MODIFIED AFTER THAT START TIME TO HAVE THE FOUR TRACKING
    FLAGS SET TO "fixed"
    """
    BUG_ID = 813650
    TRACKING_FLAGS = [
        "cf_status_firefox18",
        "cf_status_firefox19",
        "cf_status_firefox_esr17",
        "cf_status_b2g18"
    ]

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        start = CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S")
        param.start_time = CNV.datetime2milli(start)
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([BUG_ID])
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
        for version in get_all_bug_versions(es, BUG_ID):
            if version.modified_ts > param.start_time:
                for flag in TRACKING_FLAGS:
                    if version[flag] != "fixed":
                        Log.error("813650 should have {{flag}}=='fixed'", {"flag": flag})
def test_private_bugs_do_not_show(self):
    """
    PUT THREE RANDOM BUGS INTO BUG_GROUP_FOR_TESTING, RUN THE ETL WITH
    allow_private_bugs=False, THEN VERIFY THOSE BUGS ARE NOT IN ES
    """
    settings = self.settings
    settings.param.allow_private_bugs = False

    # REMOVE BOTH RUN-TIME MARKER FILES BEFORE THE ETL RUN
    File(settings.param.first_run_time).delete()
    File(settings.param.last_run_time).delete()

    private_bugs = set(Random.sample(settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(settings.bugzilla)

    # MARK SOME BUGS PRIVATE
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            database.add_bug_group(db, bug_id, BUG_GROUP_FOR_TESTING)

    bugs_es = elasticsearch.make_test_instance("candidate", settings.real.bugs)
    comments_es = elasticsearch.make_test_instance("candidate_comments", settings.real.comments)
    bz_etl.main(settings, bugs_es, comments_es)

    Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
    verify_no_private_bugs(bugs_es, private_bugs)
def test_recent_private_stuff_does_not_show(self):
    """
    RUN THE ETL ONCE, THEN MARK SOME BUGS, COMMENTS AND ATTACHMENTS PRIVATE AND
    RUN IT AGAIN: NONE OF THE PRIVATE ITEMS MAY BE VISIBLE.  FINALLY MAKE THE
    BUGS PUBLIC AGAIN AND RUN A THIRD TIME: THE BUGS MUST BE VISIBLE, WHILE THE
    PRIVATE ATTACHMENTS AND MARKED COMMENTS MUST STAY HIDDEN.
    """
    settings = self.settings
    settings.param.allow_private_bugs = False

    # REMOVE BOTH RUN-TIME MARKER FILES BEFORE THE FIRST ETL RUN
    File(settings.param.first_run_time).delete()
    File(settings.param.last_run_time).delete()

    database.make_test_instance(settings.bugzilla)

    bugs_es = elasticsearch.make_test_instance("candidate", settings.real.bugs)
    comments_es = elasticsearch.make_test_instance("candidate_comments", settings.real.comments)
    bz_etl.main(settings, bugs_es, comments_es)

    # MARK SOME STUFF PRIVATE
    with DB(settings.bugzilla) as db:
        # BUGS
        private_bugs = set(Random.sample(settings.param.bugs, 3))
        Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})
        for bug_id in private_bugs:
            database.add_bug_group(db, bug_id, BUG_GROUP_FOR_TESTING)

        # COMMENTS
        all_comment_ids = db.query("SELECT comment_id FROM longdescs").comment_id
        marked_private_comments = Random.sample(all_comment_ids, 5)
        for comment_id in marked_private_comments:
            database.mark_comment_private(db, comment_id, isprivate=1)

        # INCLUDE COMMENTS OF THE PRIVATE BUGS
        bug_filter = esfilter2sqlwhere(db, {"terms": {"bug_id": private_bugs}})
        implied_private_comments = db.query(
            """
            SELECT comment_id FROM longdescs WHERE {{where}}
            """,
            {"where": bug_filter}
        ).comment_id
        private_comments = marked_private_comments + implied_private_comments
        Log.note("The private comments are {{comments}}", {"comments": private_comments})

        # ATTACHMENTS
        attachments = db.query("SELECT bug_id, attach_id FROM attachments")
        private_attachments = Random.sample(attachments, 5)
        Log.note("The private attachments are {{attachments}}", {"attachments": private_attachments})
        for attachment in private_attachments:
            database.mark_attachment_private(db, attachment.attach_id, isprivate=1)

    # THE FIRST RUN MUST HAVE LEFT ITS last_run_time FILE BEHIND
    if not File(settings.param.last_run_time).exists:
        Log.error("last_run_time should exist")
    bz_etl.main(settings, bugs_es, comments_es)

    Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
    verify_no_private_bugs(bugs_es, private_bugs)
    verify_no_private_attachments(bugs_es, private_attachments)
    verify_no_private_comments(comments_es, private_comments)

    # MARK SOME STUFF PUBLIC
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            database.remove_bug_group(db, bug_id, BUG_GROUP_FOR_TESTING)

    bz_etl.main(settings, bugs_es, comments_es)

    # VERIFY BUG IS PUBLIC, BUT PRIVATE ATTACHMENTS AND COMMENTS STILL NOT
    Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
    verify_public_bugs(bugs_es, private_bugs)
    verify_no_private_attachments(bugs_es, private_attachments)
    verify_no_private_comments(comments_es, marked_private_comments)
def test_incremental_has_correct_expires_on(self):
    """
    RUN THE ETL TWICE FOR TWO BUGS - FIRST UP TO 2013-01-01, THEN FROM
    2013-01-01 TO NOW - AND REQUIRE AT MOST ONE VERSION PER BUG WHOSE
    expires_on IS STILL IN THE FUTURE
    """
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        # SETUP FIRST RUN PARAMETERS
        first_run = Struct()
        first_run.end_time = start_incremental
        first_run.start_time = 0
        first_run.start_time_str = extract_bugzilla.milli2string(db, first_run.start_time)
        first_run.alias_file = self.settings.param.alias_file
        first_run.bug_list = bugs
        first_run.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, first_run, please_stop=None)

        # SETUP INCREMENTAL RUN PARAMETERS
        second_run = Struct()
        second_run.end_time = CNV.datetime2milli(datetime.utcnow())
        second_run.start_time = start_incremental
        second_run.start_time_str = extract_bugzilla.milli2string(db, second_run.start_time)
        second_run.alias_file = self.settings.param.alias_file
        second_run.bug_list = bugs
        second_run.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, second_run, please_stop=None)

    # NOTE(review): UNLIKE THE SIBLING TESTS THERE IS NO SLEEP BEFORE SEARCHING, SO ES MAY
    # NOT HAVE INDEXED EVERYTHING YET AND THE CHECK BELOW COULD PASS ON ZERO HITS - CONFIRM
    for bug_id in bugs:
        still_active = {"range": {"expires_on": {"gte": CNV.datetime2milli(datetime.utcnow())}}}
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {"bug_id": bug_id}},
                    still_active
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        if results.hits.total > 1:
            Log.error("Expecting only one active bug_version record")
def test_changes_to_private_bugs_still_have_bug_group(self):
    """
    WITH allow_private_bugs=True: MARK THREE RANDOM BUGS PRIVATE, RUN THE ETL,
    CHANGE bug_status OF THOSE BUGS, RUN THE ETL AGAIN, AND REQUIRE THE
    CURRENT VERSION OF EACH BUG TO STILL CARRY BUG_GROUP_FOR_TESTING
    """
    settings = self.settings
    settings.param.allow_private_bugs = True

    # REMOVE BOTH RUN-TIME MARKER FILES BEFORE THE FIRST ETL RUN
    File(settings.param.first_run_time).delete()
    File(settings.param.last_run_time).delete()

    private_bugs = set(Random.sample(settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(settings.bugzilla)

    # MARK SOME BUGS PRIVATE
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            database.add_bug_group(db, bug_id, BUG_GROUP_FOR_TESTING)

    bugs_es = elasticsearch.make_test_instance("candidate", settings.real.bugs)
    comments_es = elasticsearch.make_test_instance("candidate_comments", settings.real.comments)
    bz_etl.main(settings, bugs_es, comments_es)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            before = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": bug_id})[0]
            after = before.copy()
            after.bug_status = "NEW STATUS"
            diff(db, "bugs", before, after)

    # RUN INCREMENTAL
    bz_etl.main(settings, bugs_es, comments_es)

    # VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # GIVE ES TIME TO FINISH INDEXING BEFORE QUERYING
    now = datetime.utcnow()
    results = bugs_es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"bug_id": private_bugs}},
                {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })
    latest_bugs = Q.select(results.hits.hits, "_source")
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

    for bug_id in private_bugs:
        # "== None" IS KEPT AS WRITTEN (NOT "is None"); PRESUMABLY THE LOOKUP CAN
        # RETURN A NULL-LIKE OBJECT THAT ONLY COMPARES EQUAL TO None - TODO CONFIRM
        if latest_bugs_index[bug_id] == None:
            Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

        bug_group = latest_bugs_index[bug_id].bug_group
        if not bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error(
                "Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group",
                {"bug_id": bug_id, "bug_group": BUG_GROUP_FOR_TESTING}
            )