def test_incremental_etl_catches_tracking_flags(self):
    """
    RUN AN INCREMENTAL ETL OVER BUG 813650 AND VERIFY EVERY VERSION MODIFIED
    AFTER THE START TIME HAS ALL FOUR TRACKING FLAGS EQUAL TO "fixed"
    """
    BUG_ID = 813650
    TRACKING_FLAGS = [
        "cf_status_firefox18",
        "cf_status_firefox19",
        "cf_status_firefox_esr17",
        "cf_status_b2g18"
    ]
    settings = self.settings

    database.make_test_instance(settings.bugzilla)

    with DB(settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", settings.candidate)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        start = CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S")
        param.start_time = CNV.datetime2milli(start)
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = settings.param.alias_file
        param.bug_list = struct.wrap([BUG_ID])
        param.allow_private_bugs = settings.param.allow_private_bugs

        # THE QUEUE IS CLOSED (END OF with) BEFORE ES IS QUERIED
        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING

        for version in get_all_bug_versions(es, BUG_ID):
            # ONLY VERSIONS MADE AFTER THE INCREMENTAL START ARE CHECKED
            if not (version.modified_ts > param.start_time):
                continue
            for flag in TRACKING_FLAGS:
                if version[flag] != "fixed":
                    Log.error("813650 should have {{flag}}=='fixed'", {"flag": flag})
def main(settings, es=None, es_comments=None):
    """
    RUN ONE ETL PASS FROM BUGZILLA INTO ES.  AN INCREMENTAL ETL IS DONE WHEN
    setup_es() REPORTS last_run_time > 0, OTHERWISE A FULL ETL IS DONE

    settings - CONFIGURATION; READS .param, .bugzilla, .es AND .es_comments
    es - OPTIONAL HANDLE FOR THE BUG INDEX, HANDED TO setup_es()
    es_comments - OPTIONAL HANDLE FOR THE COMMENT INDEX, HANDED TO setup_es()

    ANY EXCEPTION IS REPORTED WITH Log.error("Problem with main ETL loop")
    """
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # A first_run_time FILE WITHOUT A last_run_time FILE IS TREATED AS A RUN TO RESUME
    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 5 MINUTES
                # (UNLESS settings.param.look_back SAYS OTHERWISE), JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            # FIX: CLEAN UP THROUGH THE COMMENTS HANDLE (WAS es.delete_all_but), SO THE
            # COMMENT INDEXES ARE HANDLED BY THE SAME HANDLE THAT RECEIVES THE ALIAS
            es_comments.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # ONLY RECORD THE RUN TIME AFTER EVERYTHING ABOVE SUCCEEDED
        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception as e:
        Log.error("Problem with main ETL loop", e)
def test_incremental_has_correct_expires_on(self):
    """
    LOAD TWO BUGS IN TWO PASSES (TIME ZERO TO 2013-01-01, THEN 2013-01-01 TO NOW)
    AND VERIFY EACH BUG HAS NO MORE THAN ONE VERSION WITH expires_on IN THE FUTURE
    """
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        def run_pass(start_time, end_time):
            # ONE ETL PASS OVER bugs FOR THE GIVEN TIME WINDOW (MILLISECONDS)
            param = Struct()
            param.end_time = end_time
            param.start_time = start_time
            param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

        #FIRST RUN
        run_pass(0, start_incremental)

        #INCREMENTAL RUN; "NOW" IS TAKEN ONLY AFTER THE FIRST RUN HAS FINISHED
        run_pass(start_incremental, CNV.datetime2milli(datetime.utcnow()))

    for bug_id in bugs:
        is_this_bug = {"term": {"bug_id": bug_id}}
        not_expired = {"range": {"expires_on": {"gte": CNV.datetime2milli(datetime.utcnow())}}}
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [is_this_bug, not_expired]}
            }},
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        if results.hits.total > 1:
            Log.error("Expecting only one active bug_version record")
def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination

    source - ES HANDLE TO SEARCH
    destination - RECEIVES {"id":..., "value":...} RECORDS THROUGH extend()
    pending - BUG IDS TO COPY; PROCESSED IN GROUPS OF AT MOST BATCH_SIZE
    last_updated - ONLY VERSIONS WITH modified_ts AT, OR AFTER, THIS TIME ARE COPIED
    """
    for _, batch in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(batch)}):
            in_batch = {"terms": {"bug_id": set(batch)}}
            recent = {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [in_batch, recent]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            # NORMALIZE EVERY HIT FIRST, THEN WRAP EACH FOR THE DESTINATION
            normalized = [
                transform_bugzilla.normalize(
                    transform_bugzilla.rename_attachments(hit._source),
                    old_school=True
                )
                for hit in data.hits.hits
            ]
            records = [{"id": version.id, "value": version} for version in normalized]
            destination.extend(records)
def get_pending(es, since):
    """
    RETURN A Multiset BUILT FROM THE bug_id TERMS FACET OF ALL VERSIONS
    WITH modified_ts AT, OR AFTER, since

    Log.error IS CALLED WHEN THE FACET COMES BACK WITH 200000 TERMS, OR MORE
    """
    MAX_TERMS = 200000

    modified_since = {"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": modified_since
        }},
        "from": 0,
        "size": 0,  # NO HITS REQUESTED, ONLY THE FACET
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": MAX_TERMS}}}
    }
    terms = es.search(query).facets.default.terms

    if len(terms) >= MAX_TERMS:
        Log.error("Can not handle more than 200K bugs changed")

    pending_bugs = Multiset(terms, key_field="term", count_field="count")
    Log.note("Source has {{num}} bug versions for updating", {"num": len(pending_bugs)})
    return pending_bugs
def main(settings):
    """
    RUN THE ETL FOR settings.param.bugs INTO A Fake_ES CANDIDATE, THEN
    COMPARE THE CANDIDATE TO THE REFERENCE ES WITH compare_both()
    """
    #MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        #REAL ES
        # if settings.candidate.alias is None:
        #     settings.candidate.alias=settings.candidate.index
        #     settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        # candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)
        bug_ids = settings.param.bugs

        #SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
        columns = param.BUGS_TABLE_COLUMNS
        quoted_names = ["`" + c.column_name + "`" for c in columns]
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(quoted_names))
        # FROM HERE ON, BUGS_TABLE_COLUMNS HOLDS ONLY THE COLUMN NAMES
        param.BUGS_TABLE_COLUMNS = Q.select(columns, "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(bug_ids)})

        etl(db, candidate, param)

        #COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)
def test_ambiguous_whiteboard_screened(self):
    """
    PUT ONE BUG IN BOTH A SCREENED GROUP AND A GROUP NAMED "not screened",
    RUN THE ETL, AND VERIFY EVERY VERSION HAS AN EMPTY, OR "[screened]", WHITEBOARD
    """
    GOOD_BUG_TO_TEST = 1046
    ACCEPTABLE_WHITEBOARDS = (None, "", "[screened]")
    settings = self.settings

    database.make_test_instance(settings.bugzilla)

    with DB(settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", settings.candidate)

        #MARK BUG AS ONE OF THE SCREENED GROUPS, AND AS ONE OF THE *NOT* SCREENED GROUPS
        for group in (SCREENED_WHITEBOARD_BUG_GROUPS[0], "not screened"):
            database.add_bug_group(db, GOOD_BUG_TO_TEST, group)
        db.flush()

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = settings.param.alias_file
        # bug 1046 sees lots of whiteboard, and other field, changes
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING

        for version in get_all_bug_versions(es, GOOD_BUG_TO_TEST):
            if version.status_whiteboard not in ACCEPTABLE_WHITEBOARDS:
                Log.error("Expecting whiteboard to be screened")
def get_all_bug_versions(es, bug_id, max_time=None):
    """
    RETURN THE _source OF EVERY VERSION OF bug_id FOUND IN es

    es - ES HANDLE TO SEARCH
    bug_id - THE BUG TO LOOK UP
    max_time - OPTIONAL; WHEN GIVEN, ONLY VERSIONS WITH modified_ts AT, OR BEFORE,
               THIS TIME ARE RETURNED.  WHEN OMITTED, NO TIME LIMIT IS APPLIED
    """
    filters = [{"term": {"bug_id": bug_id}}]
    # max_time IS OPTIONAL SO CALLERS THAT PASS ONLY (es, bug_id) WORK.
    # "!= None" (NOT "is not None") MATCHES HOW THE REST OF THIS CODE TESTS FOR MISSING VALUES
    if max_time != None:
        filters.append({"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}})

    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": filters}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS

    ONE bugs_activity ROW IS INSERTED FOR EACH FIELD THAT IS MISSING FROM ONE OF
    THE RECORDS, OR HAS A DIFFERENT VALUE IN EACH; THEN bugs.delta_ts IS SET TO
    THE DATABASE'S CURRENT TIME FOR old_record.bug_id

    table - NAME OF THE TABLE THE RECORDS BELONG TO; USED AS "<table>." PREFIX
            ON THE fielddefs NAME, EXCEPT FOR THE bugs TABLE
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))

    # KEYS FOUND IN ONLY ONE RECORD, PLUS KEYS WHOSE VALUES DIFFER
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set(k for k, v in old_record.items() if v != new_record[k])

    prefix = u"" if table == u"bugs" else table + u"."

    for field in changed:
        rows = db.query(
            "SELECT id FROM fielddefs WHERE name={{field_name}}",
            {"field_name": prefix + field}
        )
        fieldid = rows[0].id
        if fieldid == None:
            Log.error("Expecting a valid field name")

        db.insert("bugs_activity", Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[field],
            added=new_record[field],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        ))

    where = esfilter2sqlwhere(db, {"term": {"bug_id": old_record.bug_id}})
    db.execute("UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
        "now": now,
        "where": where
    })
def main(settings):
    """
    FILL A Fake_ES CANDIDATE WITH THE ETL OF settings.param.bugs AND
    COMPARE IT, BUG BY BUG, TO THE REFERENCE ES USING compare_both()
    """
    #MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        #REAL ES
        # if settings.candidate.alias is None:
        #     settings.candidate.alias=settings.candidate.index
        #     settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        # candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
        column_details = param.BUGS_TABLE_COLUMNS
        backticked = ["`" + col.column_name + "`" for col in column_details]
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(backticked))
        # THE COLUMN DETAILS ARE REPLACED BY JUST THE COLUMN NAMES
        param.BUGS_TABLE_COLUMNS = Q.select(column_details, "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL(
            "bug_id in {{bugs}}",
            {"bugs": db.quote(settings.param.bugs)}
        )

        etl(db, candidate, param)

        #COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
    THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
    THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
    """
    settings = self.settings
    # settings.param.allow_private_bugs = True

    database.make_test_instance(settings.bugzilla)

    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.candidate)
        reference = elasticsearch.open_test_instance("reference", settings.private_bugs_reference)

        #SETUP RUN PARAMETERS (FROM TIME ZERO UP TO THE DATABASE'S CURRENT TIME)
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = settings.param.alias_file
        param.bug_list = settings.param.bugs
        param.allow_private_bugs = settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)

        #COMPARE ALL BUGS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        compare_both(candidate, reference, settings, settings.param.bugs)
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
    THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
    THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
    """
    cfg = self.settings
    # settings.param.allow_private_bugs = True

    database.make_test_instance(cfg.bugzilla)

    with DB(cfg.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", cfg.candidate)
        reference = elasticsearch.open_test_instance("reference", cfg.private_bugs_reference)

        #SETUP RUN PARAMETERS (FROM TIME ZERO UP TO THE DATABASE'S CURRENT TIME)
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = cfg.param.alias_file
        param.bug_list = cfg.param.bugs
        param.allow_private_bugs = cfg.param.allow_private_bugs

        # THE QUEUE IS CLOSED (END OF with) BEFORE THE COMPARE STARTS
        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)

        #COMPARE ALL BUGS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        compare_both(candidate, reference, cfg, cfg.param.bugs)
def test_whiteboard_screened(self):
    """
    PUT ONE BUG IN A SCREENED GROUP, RUN THE ETL, AND VERIFY EVERY
    VERSION HAS AN EMPTY, OR "[screened]", WHITEBOARD
    """
    GOOD_BUG_TO_TEST = 1046
    ACCEPTABLE_WHITEBOARDS = (None, "", "[screened]")
    settings = self.settings

    database.make_test_instance(settings.bugzilla)

    with DB(settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", settings.candidate)

        #MARK BUG AS ONE OF THE SCREENED GROUPS
        screened_group = SCREENED_WHITEBOARD_BUG_GROUPS[0]
        database.add_bug_group(db, GOOD_BUG_TO_TEST, screened_group)
        db.flush()

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = settings.param.alias_file
        # bug 1046 sees lots of whiteboard, and other field, changes
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING

        for version in get_all_bug_versions(es, GOOD_BUG_TO_TEST):
            if version.status_whiteboard not in ACCEPTABLE_WHITEBOARDS:
                Log.error("Expecting whiteboard to be screened")
def old2new(bug):
    """
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION

    bug IS CHANGED IN PLACE
    """
    # id IS _id WITH "." REPLACED BY "_", LESS THE LAST THREE CHARACTERS
    old_id = bug._id
    bug.id = old_id.replace(".", "_")[:-3]
    bug._id = None

    # INTEGER FIELDS
    confirmed = bug.everconfirmed
    if confirmed is not None:
        bug.everconfirmed = int(confirmed)
    votes = bug.votes
    if votes is not None:
        bug.votes = int(votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)

    # ZERO VALUES ARE REMOVED
    if bug.votes == 0:
        del bug["votes"]
    if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
        del bug["remaining_time"]

    # DATES BECOME MILLISECONDS
    due_date = bug.cf_due_date
    if due_date is not None:
        bug.cf_due_date = CNV.datetime2milli(CNV.string2datetime(due_date, "%Y-%m-%d"))

    if bug.everconfirmed == 0:
        del bug["everconfirmed"]

    try:
        resolved = CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S")
        bug.cf_last_resolved = CNV.datetime2milli(resolved)
    except Exception:
        # cf_last_resolved IS LEFT AS IT WAS WHEN IT CAN NOT BE CONVERTED
        pass
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
    WHEN I HAVE A REFERENCE TO COMPARE TO

    LOOPS OVER RANDOM SAMPLES OF NON-PRIVATE BUG IDS UNTIL compare_both() REPORTS ERRORS
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000
    settings = self.settings

    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.candidate)
        reference = ElasticSearch(settings.private_bugs_reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if not local_cache.exists:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))
        else:
            private_bugs = set(CNV.JSON2object(local_cache.read()))

        while True:
            picks = [Random.int(MAX_BUG_ID) for _ in range(NUM_TO_TEST)]
            some_bugs = [b for b in picks if b not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            #SETUP RUN PARAMETERS
            # NOTE(review): some_bugs IS NEVER PUT ON param (NO param.bug_list), SO etl()
            # IS NOT TOLD WHICH BUGS WERE SAMPLED - CONFIRM THIS IS INTENDED
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                #COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
            except Exception as e:
                # WARN, AND TRY AGAIN WITH ANOTHER SAMPLE
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION

    bug - THE OLD-FORMAT BUG VERSION
    max_date - ANY expires_on AFTER THIS IS REPLACED WITH parse_bug_history.MAX_TIME

    RETURNS THE CONVERTED BUG.  THE RETURN VALUE MUST BE USED: bug IS REPLACED BY A
    COPY PART WAY THROUGH, SO ONLY THE everconfirmed CHANGE REACHES THE ORIGINAL
    """
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    # ROUND TRIP THROUGH JSON TO REMOVE THE TRAILING SPACE FROM THIS ONE VALUE;
    # FROM HERE ON bug IS A NEW OBJECT
    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    if bug.votes == 0:
        del bug["votes"]
        # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
        #     bug.remaining_time = 0
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )

    # CHANGES ARE SORTED BY field_name, AND USE THE NEW PROPERTY NAMES
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )

    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    if bug.id == "692436_1336314345":
        bug.votes = 3  # SPECIAL CASE FOR THIS ONE BUG VERSION

    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception:
        # cf_last_resolved IS LEFT AS IT WAS WHEN IT CAN NOT BE CONVERTED
        pass

    # FIX: WITHOUT THIS, EVERYTHING DONE AFTER THE JSON ROUND TRIP WAS LOST TO THE CALLER
    return bug
def main(settings, es=None, es_comments=None):
    """
    RUN ONE ETL PASS FROM BUGZILLA INTO ES.  AN INCREMENTAL ETL IS DONE WHEN
    setup_es() REPORTS last_run_time > 0, OTHERWISE A FULL ETL IS DONE

    settings - CONFIGURATION; READS .param, .bugzilla, .es AND .es_comments
    es - OPTIONAL HANDLE FOR THE BUG INDEX, HANDED TO setup_es()
    es_comments - OPTIONAL HANDLE FOR THE COMMENT INDEX, HANDED TO setup_es()

    ANY EXCEPTION IS REPORTED WITH Log.error("Problem with main ETL loop")
    """
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # A first_run_time FILE WITHOUT A last_run_time FILE IS TREATED AS A RUN TO RESUME
    first_run_file = File(settings.param.first_run_time)
    last_run_file = File(settings.param.last_run_time)
    resume_from_last_run = first_run_file.exists and not last_run_file.exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 5 MINUTES
                # (UNLESS settings.param.look_back SAYS OTHERWISE), JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            # FIX: CLEAN UP THROUGH THE COMMENTS HANDLE (WAS es.delete_all_but), SO THE
            # COMMENT INDEXES ARE HANDLED BY THE SAME HANDLE THAT RECEIVES THE ALIAS
            es_comments.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # ONLY RECORD THE RUN TIME AFTER EVERYTHING ABOVE SUCCEEDED
        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception as e:
        Log.error("Problem with main ETL loop", e)
def random_sample_of_bugs(settings):
    """
    REPEATEDLY RUN THE ETL OVER A RANDOM SAMPLE OF NON-PRIVATE BUG IDS, AND COMPARE
    THE Fake_ES CANDIDATE TO THE REFERENCE ES, UNTIL compare_both() REPORTS ERRORS

    settings - CONFIGURATION; READS .bugzilla, .fake_es, .reference AND .param
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            #SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            # FROM HERE ON, BUGS_TABLE_COLUMNS HOLDS ONLY THE COLUMN NAMES
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
            except Exception as e:
                # WARN, AND TRY AGAIN WITH ANOTHER SAMPLE (MESSAGE TYPO "faiure" FIXED)
                D.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def old2new(bug):
    """
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION

    bug IS CHANGED IN PLACE
    """
    # id IS _id WITH "." REPLACED BY "_", LESS THE LAST THREE CHARACTERS
    underscored = bug._id.replace(".", "_")
    bug.id = underscored[:-3]
    bug._id = None

    # INTEGER FIELDS
    everconfirmed = bug.everconfirmed
    if everconfirmed is not None:
        bug.everconfirmed = int(everconfirmed)
    vote_count = bug.votes
    if vote_count is not None:
        bug.votes = int(vote_count)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)

    # ZERO VALUES ARE REMOVED
    if bug.votes == 0:
        del bug["votes"]
    if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
        del bug["remaining_time"]

    # DATES BECOME MILLISECONDS
    if bug.cf_due_date is not None:
        due = CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        bug.cf_due_date = CNV.datetime2milli(due)

    if bug.everconfirmed == 0:
        del bug["everconfirmed"]

    try:
        last_resolved = CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S")
        bug.cf_last_resolved = CNV.datetime2milli(last_resolved)
    except Exception:
        # cf_last_resolved IS LEFT AS IT WAS WHEN IT CAN NOT BE CONVERTED
        pass
def test_incremental_etl_catches_tracking_flags(self):
    """
    RUN AN INCREMENTAL ETL OVER BUG 813650 AND VERIFY EVERY VERSION MODIFIED
    AFTER THE START TIME HAS ALL FOUR TRACKING FLAGS EQUAL TO "fixed"
    """
    BUG_ID = 813650
    EXPECTED_FIXED = [
        "cf_status_firefox18",
        "cf_status_firefox19",
        "cf_status_firefox_esr17",
        "cf_status_b2g18"
    ]
    cfg = self.settings

    database.make_test_instance(cfg.bugzilla)

    with DB(cfg.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", cfg.candidate)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        later_time = CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S")
        param.start_time = CNV.datetime2milli(later_time)
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = cfg.param.alias_file
        param.bug_list = struct.wrap([BUG_ID])
        param.allow_private_bugs = cfg.param.allow_private_bugs

        # THE QUEUE IS CLOSED (END OF with) BEFORE ES IS QUERIED
        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING

        for version in get_all_bug_versions(es, BUG_ID):
            # ONLY VERSIONS MADE AFTER THE INCREMENTAL START ARE CHECKED
            if not (version.modified_ts > param.start_time):
                continue
            for flag in EXPECTED_FIXED:
                if version[flag] != "fixed":
                    Log.error("813650 should have {{flag}}=='fixed'", {"flag": flag})
def get_all_bug_versions(es, bug_id, max_time=None):
    """
    RETURN THE _source OF EVERY VERSION OF bug_id FOUND IN es

    es - ES HANDLE TO SEARCH
    bug_id - THE BUG TO LOOK UP
    max_time - OPTIONAL; WHEN GIVEN, ONLY VERSIONS WITH modified_ts AT, OR BEFORE,
               THIS TIME ARE RETURNED.  WHEN OMITTED, NO TIME LIMIT IS APPLIED
    """
    filters = [{"term": {"bug_id": bug_id}}]
    # max_time IS OPTIONAL SO CALLERS THAT PASS ONLY (es, bug_id) WORK.
    # "!= None" (NOT "is not None") MATCHES HOW THE REST OF THIS CODE TESTS FOR MISSING VALUES
    if max_time != None:
        filters.append({"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}})

    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": filters}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY WORKS
    WHEN I HAVE A REFERENCE TO COMPARE TO

    LOOPS OVER RANDOM SAMPLES OF NON-PRIVATE BUG IDS UNTIL compare_both() REPORTS ERRORS
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000
    cfg = self.settings

    with DB(cfg.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", cfg.candidate)
        reference = ElasticSearch(cfg.private_bugs_reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(cfg.param.temp_dir + "/private_bugs.json")
        if not local_cache.exists:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))
        else:
            private_bugs = set(CNV.JSON2object(local_cache.read()))

        while True:
            random_ids = [Random.int(MAX_BUG_ID) for _ in range(NUM_TO_TEST)]
            some_bugs = [bug_id for bug_id in random_ids if bug_id not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            #SETUP RUN PARAMETERS
            # NOTE(review): some_bugs IS NEVER PUT ON param (NO param.bug_list), SO etl()
            # IS NOT TOLD WHICH BUGS WERE SAMPLED - CONFIRM THIS IS INTENDED
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = cfg.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                #COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, cfg, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
            except Exception as e:
                # WARN, AND TRY AGAIN WITH ANOTHER SAMPLE
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def random_sample_of_bugs(settings):
    """
    REPEATEDLY RUN THE ETL OVER A RANDOM SAMPLE OF NON-PRIVATE BUG IDS, AND COMPARE
    THE Fake_ES CANDIDATE TO THE REFERENCE ES, UNTIL compare_both() REPORTS ERRORS

    settings - CONFIGURATION; READS .bugzilla, .fake_es, .reference AND .param
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            #SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            # FROM HERE ON, BUGS_TABLE_COLUMNS HOLDS ONLY THE COLUMN NAMES
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                #COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
            except Exception as e:
                # WARN, AND TRY AGAIN WITH ANOTHER SAMPLE (MESSAGE TYPO "faiure" FIXED)
                D.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
    THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
    """
    settings = self.settings

    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.elasticsearch)

        #SETUP RUN PARAMETERS (FROM TIME ZERO UP TO THE DATABASE'S CURRENT TIME)
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = settings.param.alias_file
        param.bug_list = settings.param.bugs
        param.allow_private_bugs = settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
    THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
    """
    cfg = self.settings

    with DB(cfg.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", cfg.elasticsearch)

        #SETUP RUN PARAMETERS (FROM TIME ZERO UP TO THE DATABASE'S CURRENT TIME)
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = cfg.param.alias_file
        param.bug_list = cfg.param.bugs
        param.allow_private_bugs = cfg.param.allow_private_bugs

        # NOTHING IS COMPARED; THE TEST ONLY REQUIRES THE ETL TO COMPLETE
        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS

    ONE bugs_activity ROW IS INSERTED FOR EACH FIELD THAT IS MISSING FROM ONE OF
    THE RECORDS, OR HAS A DIFFERENT VALUE IN EACH; THEN bugs.delta_ts IS SET TO
    THE DATABASE'S CURRENT TIME FOR old_record.bug_id

    table - NAME OF THE TABLE THE RECORDS BELONG TO; USED AS "<table>." PREFIX
            ON THE fielddefs NAME, EXCEPT FOR THE bugs TABLE
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))

    # KEYS FOUND IN ONLY ONE RECORD, PLUS KEYS WHOSE VALUES DIFFER
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set(key for key, value in old_record.items() if value != new_record[key])

    prefix = u"" if table == u"bugs" else table + u"."

    for name in changed:
        found = db.query(
            "SELECT id FROM fielddefs WHERE name={{field_name}}",
            {"field_name": prefix + name}
        )
        fieldid = found[0].id
        if fieldid == None:
            Log.error("Expecting a valid field name")

        activity = Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[name],
            added=new_record[name],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        )
        db.insert("bugs_activity", activity)

    bug_filter = {"term": {"bug_id": old_record.bug_id}}
    db.execute("UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
        "now": now,
        "where": esfilter2sqlwhere(db, bug_filter)
    })
def get_last_updated(es):
    """
    RETURN THE LARGEST modified_ts IN es (AS A datetime), CONSIDERING ONLY
    RECORDS AT, OR AFTER, far_back.  RETURN datetime.min WHEN NONE ARE COUNTED

    ANY FAILURE IS REPORTED WITH Log.error, NAMING THE HOST AND INDEX
    """
    try:
        since_far_back = {"range": {"modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": since_far_back
            }},
            "from": 0,
            "size": 0,  # NO HITS REQUESTED, ONLY THE STATISTICS
            "sort": [],
            "facets": {"0": {"statistical": {"field": "modified_ts"}}}
        })

        stats = results.facets["0"]
        if stats.count == 0:
            return datetime.min
        return CNV.milli2datetime(stats.max)
    except Exception as e:
        where = {"host": es.settings.host, "index": es.settings.index}
        Log.error("Can not get_last_updated from {{host}}/{{index}}", where, e)
def main(settings):
    """
    FILL THE DESTINATION ES INDEX, EITHER FROM A FILE (WHEN settings.source.filename
    IS SET) OR BY COPYING RECENTLY MODIFIED BUG VERSIONS FROM THE SOURCE ES INDEX
    """
    #USE A FILE
    if settings.source.filename != None:
        # LOAD INTO A NEWLY NAMED INDEX, THEN POINT THE ALIAS AT IT
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)

        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema_json = CNV.object2JSON(schema).replace("attachments_", "attachments.")
            schema = CNV.JSON2object(schema_json)

        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED (FROM THE TIME FILE AND FROM THE DESTINATION; MIN() PICKS BETWEEN THEM)
    time_file = File(settings.param.last_replication_time)
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    else:
        from_file = None
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))

    # TAKEN BEFORE THE COPY STARTS; THIS IS WHAT GETS RECORDED
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema = CNV.JSON2object(comment_schema, paths=True) comment_schema.settings = jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name( settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True) File(settings.param.first_run_time).write( unicode(CNV.datetime2milli(current_run_time))) return current_run_time, es, es_comments, last_run_time def incremental_etl(settings, param, db, es, es_comments, output_queue): #################################################################### ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS #################################################################### #REMOVE PRIVATE BUGS private_bugs = get_private_bugs_for_delete(db, param) Log.note( "Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
def normalize(bug, old_school=False): bug=bug.copy() bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3] bug._id = None #ENSURE STRUCTURES ARE SORTED # Do some processing to make sure that diffing between runs stays as similar as possible. bug.flags=Q.sort(bug.flags, "value") if bug.attachments: if USE_ATTACHMENTS_DOT: bug.attachments=CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments.")) bug.attachments = Q.sort(bug.attachments, "attach_id") for a in bug.attachments: for k,v in list(a.items()): if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")): new_v=CNV.value2int(v) new_k=k[12:] a[k.replace(".", "\.")]=new_v if not old_school: a[new_k]=new_v a.flags = Q.sort(a.flags, ["modified_ts", "value"]) if bug.changes != None: if USE_ATTACHMENTS_DOT: json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.") bug.changes=CNV.JSON2object(json) bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"]) #bug IS CONVERTED TO A 'CLEAN' COPY bug = ElasticSearch.scrub(bug) # bug.attachments = nvl(bug.attachments, []) # ATTACHMENTS MUST EXIST for f in NUMERIC_FIELDS: v = bug[f] if v == None: continue elif f in MULTI_FIELDS: bug[f] = CNV.value2intlist(v) elif CNV.value2number(v) == 0: del bug[f] else: bug[f]=CNV.value2number(v) # Also reformat some date fields for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]: v = bug[dateField] if v == None: continue try: if isinstance(v, date): bug[dateField] = CNV.datetime2milli(v) elif isinstance(v, long) and len(unicode(v)) in [12, 13]: bug[dateField] = v elif not isinstance(v, basestring): Log.error("situation not handled") elif DATE_PATTERN_STRICT.match(v): # Convert to "2012/01/01 00:00:00.000" # Example: bug 856732 (cf_last_resolved) # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z" bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v+"000", "%Y/%m/%d 
%H:%M%:S%f")) elif DATE_PATTERN_STRICT_SHORT.match(v): # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp. # Example: bug 856732 (cf_last_resolved) # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z" bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S")) elif DATE_PATTERN_RELAXED.match(v): # Convert "2012/01/01 00:00:00.000" to "2012-01-01" # Example: bug 643420 (deadline) # bug 726635 (cf_due_date) bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d")) except Exception, e: Log.error("problem with converting date to milli (value={{value}})", {"value":bug[dateField]}, e)
from datetime import datetime import unittest from pymysql.times import TimeDelta from bzETL.extract_bugzilla import SCREENED_WHITEBOARD_BUG_GROUPS from bzETL.util.env import startup from bzETL.util import struct from bzETL.util.cnv import CNV from bzETL.util.env.elasticsearch import ElasticSearch from bzETL.util.env.emailer import Emailer from bzETL.util.env.logs import Log from bzETL.util.maths import Math from bzETL.util.queries import Q from bzETL.util.struct import nvl NOW = CNV.datetime2milli(datetime.utcnow()) A_WHILE_AGO = int(NOW - TimeDelta(minutes=10).total_seconds() * 1000) class TestLookForLeaks(unittest.TestCase): def setUp(self): settings = startup.read_settings(filename="leak_check_settings.json") Log.start(settings.debug) self.private = ElasticSearch(settings.private) self.public = ElasticSearch(settings.public) self.public_comments = ElasticSearch(settings.public_comments) self.settings = settings def tearDown(self): Log.stop() def blocks_of_bugs(self):
def normalize(bug, old_school=False): bug = bug.copy() bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3] bug._id = None #ENSURE STRUCTURES ARE SORTED # Do some processing to make sure that diffing between runs stays as similar as possible. bug.flags = Q.sort(bug.flags, "value") if bug.attachments: if USE_ATTACHMENTS_DOT: bug.attachments = CNV.JSON2object( CNV.object2JSON(bug.attachments).replace( "attachments_", "attachments.")) bug.attachments = Q.sort(bug.attachments, "attach_id") for a in bug.attachments: for k, v in list(a.items()): if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")): new_v = CNV.value2int(v) new_k = k[12:] a[k.replace(".", "\.")] = new_v if not old_school: a[new_k] = new_v a.flags = Q.sort(a.flags, ["modified_ts", "value"]) if bug.changes != None: if USE_ATTACHMENTS_DOT: json = CNV.object2JSON(bug.changes).replace( "attachments_", "attachments.") bug.changes = CNV.JSON2object(json) bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"]) #bug IS CONVERTED TO A 'CLEAN' COPY bug = ElasticSearch.scrub(bug) # bug.attachments = nvl(bug.attachments, []) # ATTACHMENTS MUST EXIST for f in NUMERIC_FIELDS: v = bug[f] if v == None: continue elif f in MULTI_FIELDS: bug[f] = CNV.value2intlist(v) elif CNV.value2number(v) == 0: del bug[f] else: bug[f] = CNV.value2number(v) # Also reformat some date fields for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]: v = bug[dateField] if v == None: continue try: if isinstance(v, date): bug[dateField] = CNV.datetime2milli(v) elif isinstance(v, long) and len(unicode(v)) in [12, 13]: bug[dateField] = v elif not isinstance(v, basestring): Log.error("situation not handled") elif DATE_PATTERN_STRICT.match(v): # Convert to "2012/01/01 00:00:00.000" # Example: bug 856732 (cf_last_resolved) # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z" bug[dateField] = CNV.datetime2milli( CNV.string2datetime(v 
+ "000", "%Y/%m/%d %H:%M%:S%f")) elif DATE_PATTERN_STRICT_SHORT.match(v): # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp. # Example: bug 856732 (cf_last_resolved) # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z" bug[dateField] = CNV.datetime2milli( CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S")) elif DATE_PATTERN_RELAXED.match(v): # Convert "2012/01/01 00:00:00.000" to "2012-01-01" # Example: bug 643420 (deadline) # bug 726635 (cf_due_date) bug[dateField] = CNV.datetime2milli( CNV.string2datetime(v[0:10], "%Y-%m-%d")) except Exception, e: Log.error( "problem with converting date to milli (value={{value}})", {"value": bug[dateField]}, e)
def test_changes_to_private_bugs_still_have_bug_group(self):
    """
    Mark three sampled bugs private, run the ETL, change those bugs, run the
    ETL again, and check the current version of each still has the test
    bug group.
    """
    self.settings.param.allow_private_bugs = True
    # REMOVE THE FIRST AND LAST RUN TIME FILES
    for run_time_file in (self.settings.param.first_run_time,
                          self.settings.param.last_run_time):
        File(run_time_file).delete()

    private_bugs = set(Random.sample(self.settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}",
             {"bugs": private_bugs})

    database.make_test_instance(self.settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(self.settings.bugzilla) as db:
        for private_id in private_bugs:
            database.add_bug_group(db, private_id, BUG_GROUP_FOR_TESTING)

    bugs_index = elasticsearch.make_test_instance(
        "candidate", self.settings.real.bugs)
    comments_index = elasticsearch.make_test_instance(
        "candidate_comments", self.settings.real.comments)

    bz_etl.main(self.settings, bugs_index, comments_index)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(self.settings.bugzilla) as db:
        for private_id in private_bugs:
            rows = db.query(
                "SELECT * FROM bugs WHERE bug_id={{bug_id}}",
                {"bug_id": private_id})
            before = rows[0]
            after = before.copy()
            after.bug_status = "NEW STATUS"
            diff(db, "bugs", before, after)

    #RUN INCREMENTAL
    bz_etl.main(self.settings, bugs_index, comments_index)

    #VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    now = datetime.utcnow()
    # ONLY VERSIONS OF THE SAMPLED BUGS THAT HAVE NOT EXPIRED YET
    is_sampled = {"terms": {"bug_id": private_bugs}}
    not_expired = {
        "range": {"expires_on": {"gte": CNV.datetime2milli(now)}}
    }
    results = bugs_index.search({
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [is_sampled, not_expired]}
            }
        },
        "from": 0,
        "size": 200000,
        "sort": []
    })

    latest_bugs = Q.select(results.hits.hits, "_source")
    # IF NOT UNIQUE, THEN ETL IS WRONG
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")

    for bug_id in private_bugs:
        current = latest_bugs_index[bug_id]
        if current == None:
            Log.error("Expecting to find the private bug {{bug_id}}",
                      {"bug_id": bug_id})

        bug_group = current.bug_group
        if not bug_group:
            Log.error(
                "Expecting private bug ({{bug_id}}) to have a bug group",
                {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error(
                "Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group",
                {"bug_id": bug_id, "bug_group": BUG_GROUP_FOR_TESTING})
def test_incremental_has_correct_expires_on(self):
    """
    Run the ETL up to 2013-01-01, then run it again from 2013-01-01 to now,
    and check that neither bug ends up with more than one unexpired version.
    """
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(
        CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate",
                                          self.settings.candidate)
    with DB(self.settings.bugzilla) as db:

        def run_etl(start_time, end_time):
            # ONE ETL PASS OVER THE TEST BUGS FOR THE GIVEN TIME WINDOW (MILLIS)
            param = Struct()
            param.end_time = end_time
            param.start_time = start_time
            param.start_time_str = extract_bugzilla.milli2string(
                db, param.start_time)
            param.alias_file = self.settings.param.alias_file
            param.bug_list = bugs
            param.allow_private_bugs = False

            with ThreadedQueue(es, size=1000) as output:
                etl(db, output, param, please_stop=None)

        #FIRST RUN: EVERYTHING BEFORE start_incremental
        run_etl(0, start_incremental)

        #INCREMENTAL RUN: FROM start_incremental UNTIL NOW
        run_etl(start_incremental, CNV.datetime2milli(datetime.utcnow()))

    # WITHOUT THIS WAIT THE SEARCH CAN RUN BEFORE THE NEW VERSIONS ARE
    # SEARCHABLE, AND THE "> 1" CHECK BELOW WOULD PASS WITHOUT TESTING ANYTHING
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING

    now_milli = CNV.datetime2milli(datetime.utcnow())
    for b in bugs:
        results = es.search({
            "query": {
                "filtered": {
                    "query": {"match_all": {}},
                    "filter": {
                        "and": [
                            {"term": {"bug_id": b}},
                            {"range": {"expires_on": {"gte": now_milli}}}
                        ]
                    }
                }
            },
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        if results.hits.total > 1:
            Log.error("Expecting only one active bug_version record")
def test_changes_to_private_bugs_still_have_bug_group(self):
    """
    Mark three sampled bugs private, run the ETL, change those bugs, run the
    ETL again, and check the current version of each still has the test
    bug group.
    """
    settings = self.settings
    settings.param.allow_private_bugs = True
    # REMOVE THE FIRST AND LAST RUN TIME FILES
    File(settings.param.first_run_time).delete()
    File(settings.param.last_run_time).delete()

    private_bugs = set(Random.sample(settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(settings.bugzilla) as db:
        for sampled_id in private_bugs:
            database.add_bug_group(db, sampled_id, BUG_GROUP_FOR_TESTING)

    es_bugs = elasticsearch.make_test_instance("candidate", settings.real.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", settings.real.comments)
    bz_etl.main(settings, es_bugs, es_comments)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(settings.bugzilla) as db:
        for sampled_id in private_bugs:
            original = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": sampled_id})[0]
            changed = original.copy()
            changed.bug_status = "NEW STATUS"
            diff(db, "bugs", original, changed)

    #RUN INCREMENTAL
    bz_etl.main(settings, es_bugs, es_comments)

    #VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    now = datetime.utcnow()
    # ONLY VERSIONS OF THE SAMPLED BUGS THAT HAVE NOT EXPIRED YET
    filters = [
        {"terms": {"bug_id": private_bugs}},
        {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
    ]
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": filters}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    }
    results = es_bugs.search(query)

    latest_bugs = Q.select(results.hits.hits, "_source")
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

    for bug_id in private_bugs:
        latest = latest_bugs_index[bug_id]
        if latest == None:
            Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

        bug_group = latest.bug_group
        if not bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                "bug_id": bug_id,
                "bug_group": BUG_GROUP_FOR_TESTING
            })
from datetime import datetime
import unittest
from pymysql.times import TimeDelta
from bzETL.extract_bugzilla import SCREENED_WHITEBOARD_BUG_GROUPS
from bzETL.util.env import startup
from bzETL.util import struct
from bzETL.util.cnv import CNV
from bzETL.util.env.elasticsearch import ElasticSearch
from bzETL.util.env.emailer import Emailer
from bzETL.util.env.logs import Log
from bzETL.util.maths import Math
from bzETL.util.queries import Q
from bzETL.util.struct import nvl

# BOTH ARE COMPUTED ONCE, WHEN THE MODULE IS IMPORTED
NOW = CNV.datetime2milli(datetime.utcnow())
# TEN MINUTES BEFORE NOW (total_seconds() * 1000 IS THE SAME SPAN IN MILLISECONDS)
A_WHILE_AGO = int(NOW - 1000 * TimeDelta(minutes=10).total_seconds())


class TestLookForLeaks(unittest.TestCase):
    """
    Test case working against the private, public and public_comments
    clusters named in leak_check_settings.json.
    """

    def setUp(self):
        """Read the settings, start logging and connect to the three indexes."""
        self.settings = startup.read_settings(filename="leak_check_settings.json")
        Log.start(self.settings.debug)

        self.private = ElasticSearch(self.settings.private)
        self.public = ElasticSearch(self.settings.public)
        self.public_comments = ElasticSearch(self.settings.public_comments)

    def tearDown(self):
        """Stop the logging started in setUp."""
        Log.stop()
schema.settings=jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema=CNV.JSON2object(comment_schema, paths=True) comment_schema.settings=jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True) File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time))) return current_run_time, es, es_comments, last_run_time def incremental_etl(settings, param, db, es, es_comments, output_queue): #################################################################### ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS #################################################################### #REMOVE PRIVATE BUGS private_bugs = get_private_bugs_for_delete(db, param) Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs}) for g, delete_bugs in Q.groupby(private_bugs, size=1000): still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})