def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings)
    except Exception, e:
        Log.error("Problems exist", e)
def setUp(self):
    settings = startup.read_settings(filename="leak_check_settings.json")
    Log.start(settings.debug)

    self.private = ElasticSearch(settings.private)
    self.public = ElasticSearch(settings.public)
    self.public_comments = ElasticSearch(settings.public_comments)
    self.settings = settings
def start():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--quick", "--fast"],
            "help": "use this to process the first and last block, useful for testing the config settings before doing a full run",
            "action": "store_true",
            "dest": "quick"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "use this to force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }])

        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                for l in struct.listwrap(settings.debug.log):
                    if l.filename:
                        File(l.filename).parent.delete()
                File(settings.param.first_run_time).delete()
                File(settings.param.last_run_time).delete()

            Log.start(settings.debug)
            main(settings)
    except Exception, e:
        Log.fatal("Can not start", e)
def test_private_etl(self):
    """
    ENSURE IDENTIFIABLE INFORMATION DOES NOT EXIST ON ANY BUGS
    """
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()
    self.settings.param.allow_private_bugs = True

    database.make_test_instance(self.settings.bugzilla)
    es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
    bz_etl.main(self.settings, es, es_comments)

    ref = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)
    compare_both(es, ref, self.settings, self.settings.param.bugs)

    #DIRECT COMPARE THE FILE JSON
    can = File(self.settings.fake.comments.filename).read()
    ref = File(self.settings.private_comments_reference.filename).read()
    if can != ref:
        found = -1
        for i, c in enumerate(can):
            if can[i] != ref[i]:
                found = i
                break
        Log.error("Comments do not match reference\n{{sample}}", {"sample": can[MAX([0, found - 100]):found + 100]})
def get_pending(es, since):
    result = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    })

    if len(result.facets.default.terms) >= 200000:
        Log.error("Can not handle more than 200K bugs changed")

    pending_bugs = Multiset(
        result.facets.default.terms,
        key_field="term",
        count_field="count"
    )
    Log.note("Source has {{num}} bug versions for updating", {"num": len(pending_bugs)})
    return pending_bugs
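# HEDGED USAGE SKETCH (NOT PART OF THE ORIGINAL SOURCE): turn the pending Multiset
# into a bug_list for an incremental run. Only constructs already used in this
# module are assumed: the Multiset's counts are exposed through .dic, and bug lists
# are wrapped with struct.wrap(). The example function name is hypothetical.
def example_pending_to_bug_list(es, since):
    pending = get_pending(es, since)
    # THE MULTISET KEYS ARE bug_ids; THE COUNTS ARE HOW MANY VERSIONS CHANGED
    return struct.wrap([bug_id for bug_id, count in pending.dic.iteritems()])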
def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
def test_ambiguous_whiteboard_screened(self):
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        #MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        #MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
        db.flush()

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")
def get_max_bug_id(es):
    try:
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"script": {"script": "true"}}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "bug_id"}}}
        })

        if results.facets["0"].count == 0:
            return 0
        return results.facets["0"].max
    except Exception, e:
        Log.error("Can not get_max_bug from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)
def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER
    """
    current_run_time = get_current_time(db)

    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                if not settings.es.alias:
                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = temp.last()
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  #REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                if not settings.es_comments.alias:
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
def test_public_etl(self):
    """
    ENSURE ETL GENERATES WHAT'S IN THE REFERENCE FILE
    """
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()
    self.settings.param.allow_private_bugs = Null

    database.make_test_instance(self.settings.bugzilla)
    es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
    bz_etl.main(self.settings, es, es_comments)

    ref = elasticsearch.open_test_instance("reference", self.settings.public_bugs_reference)
    compare_both(es, ref, self.settings, self.settings.param.bugs)

    #DIRECT COMPARE THE FILE JSON
    can = File(self.settings.fake.comments.filename).read()
    ref = File(self.settings.public_comments_reference.filename).read()
    if can != ref:
        found = -1
        for i, c in enumerate(can):
            if can[i] != ref[i]:
                found = i
                break
        Log.error("Comments do not match reference\n{{sample}}", {"sample": can[MAX([0, found - 100]):found + 100]})
def verify_public_bugs(es, private_bugs):
    #VERIFY BUGS ARE IN OUTPUT
    for b in private_bugs:
        versions = compare_es.get_all_bug_versions(es, b)
        if not versions:
            Log.error("Expecting versions for public bug {{bug_id}}", {"bug_id": b})
def test_whiteboard_screened(self):
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        #MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        db.flush()

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")
def get_recent_private_comments(db, param):
    """
    GET COMMENTS THAT HAVE HAD THEIR PRIVACY INDICATOR CHANGED
    """
    if param.allow_private_bugs:
        return []

    param.field_id = PRIVATE_COMMENTS_FIELD_ID

    try:
        comments = db.query("""
            SELECT
                a.comment_id,
                a.bug_id
            FROM
                bugs_activity a
            WHERE
                bug_when >= {{start_time_str}} AND
                fieldid={{field_id}}
            """, param)

        return comments
    except Exception, e:
        Log.error("problem getting recent private comments", e)
def get_comments_by_id(db, comments, param):
    """
    GET SPECIFIC COMMENTS
    """
    if param.allow_private_bugs:
        return []

    param.comments_filter = esfilter2sqlwhere(db, {"and": [
        {"term": {"isprivate": 0}},
        {"terms": {"c.comment_id": comments}}
    ]})

    try:
        comments = db.query("""
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                c.thetext comment,
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{comments_filter}}
            """, param)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        fieldid = db.query("SELECT id FROM fielddefs WHERE name={{field_name}}", {"field_name": prefix + c})[0].id

        if fieldid == None:
            Log.error("Expecting a valid field name")

        activity = Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[c],
            added=new_record[c],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        )
        db.insert("bugs_activity", activity)

    db.execute("UPDATE bugs SET delta_ts={{now}} WHERE {{where}}", {
        "now": now,
        "where": esfilter2sqlwhere(db, {"term": {"bug_id": old_record.bug_id}})
    })
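# HEDGED USAGE SKETCH (NOT PART OF THE ORIGINAL SOURCE): diff() compares two versions
# of the same record and writes one bugs_activity row per changed field, then bumps
# delta_ts. The records and the example function are hypothetical; only the Struct
# and DB helpers already used above are assumed.
def example_record_whiteboard_change(db_settings):
    old_bug = Struct(bug_id=1046, attach_id=None, comment_id=None, status_whiteboard="")
    new_bug = Struct(bug_id=1046, attach_id=None, comment_id=None, status_whiteboard="[screened]")
    with DB(db_settings) as db:
        diff(db, u"bugs", old_bug, new_bug)  # ONE bugs_activity ROW, THEN delta_ts IS UPDATED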
def test_incremental_etl_catches_tracking_flags(self):
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        #SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([813650])
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, 813650)

        flags = ["cf_status_firefox18", "cf_status_firefox19", "cf_status_firefox_esr17", "cf_status_b2g18"]
        for v in versions:
            if v.modified_ts > param.start_time:
                for f in flags:
                    if v[f] != "fixed":
                        Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})
def verify_no_private_bugs(es, private_bugs):
    #VERIFY BUGS ARE NOT IN OUTPUT
    for b in private_bugs:
        versions = compare_es.get_all_bug_versions(es, b)
        if versions:
            Log.error("Expecting no version for private bug {{bug_id}}", {"bug_id": b})
def tearDown(self):
    #CLOSE THE CACHED DB CONNECTIONS
    bz_etl.close_db_connections()

    if all_db:
        Log.error("not all db connections are closed")

    Log.stop()
def verify_no_private_attachments(es, private_attachments):
    #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
    for b in Q.select(private_attachments, "bug_id"):
        versions = compare_es.get_all_bug_versions(es, b)
        #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
        #BELONGS TO, IF AT ALL
        for v in versions:
            for a in v.attachments:
                if a.attach_id in Q.select(private_attachments, "attach_id"):
                    Log.error("Private attachment should not exist")
def get_private_bugs_for_delete(db, param):
    if param.allow_private_bugs:
        return {0}  # NO BUGS TO DELETE

    try:
        with Timer("get all private bug ids"):
            private_bugs = db.query("SELECT DISTINCT bug_id FROM bug_group_map")
            return set(private_bugs.bug_id) | {0}
    except Exception, e:
        Log.error("problem getting private bugs", e)
def run_both_etl(db, output_queue, es_comments, param):
    comment_thread = Thread.run("etl comments", etl_comments, db, es_comments, param)
    process_thread = Thread.run("etl", etl, db, output_queue, param)

    result = comment_thread.join()
    if result.exception:
        Log.error("etl_comments had problems", result.exception)

    result = process_thread.join()
    if result.exception:
        Log.error("etl had problems", result.exception)
def test_replication():
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"], source)

        replicate.replicate(source, destination, [537285], CNV.string2datetime("19900101", "%Y%m%d"))
    finally:
        Log.stop()
def extend(self, records):
    """
    JUST SO WE MODEL A Queue
    """
    records = {v["id"]: v["value"] for v in records}

    struct.unwrap(self.data).update(records)

    data_as_json = CNV.object2JSON(self.data, pretty=True)
    File(self.filename).write(data_as_json)
    Log.note("{{num}} items added", {"num": len(records)})
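# HEDGED USAGE SKETCH (NOT PART OF THE ORIGINAL SOURCE): extend() expects the same
# {"id": ..., "value": ...} record shape that the real ElasticSearch bulk writer
# receives (compare etl_comments() below). fake_es and comments are hypothetical.
def example_fill_fake_es(fake_es, comments):
    fake_es.extend({"id": c.comment_id, "value": c} for c in comments)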
def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        return ElasticSearch(settings)
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)}):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            Log.warning("No alias file found (looking at {{filename}})", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)})
            alias_json = "{}"

        #self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception, e:
        # OUTER HANDLER WAS MISSING IN THE ORIGINAL EXCERPT; ASSUMED TO FOLLOW THE MODULE'S USUAL PATTERN
        Log.error("Can not load aliases", e)
def get_comments(db, param):
    if not param.bug_list:
        return []

    if param.allow_private_bugs:
        param.comment_field = SQL("'[screened]' comment")
        param.bug_filter = esfilter2sqlwhere(db, {"and": [
            {"terms": {"bug_id": param.bug_list}}
        ]})
    else:
        param.comment_field = SQL("c.thetext comment")
        param.bug_filter = esfilter2sqlwhere(db, {"and": [
            {"terms": {"bug_id": param.bug_list}},
            {"term": {"isprivate": 0}}
        ]})

    try:
        comments = db.query("""
            SELECT
                c.comment_id,
                c.bug_id,
                p.login_name modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(bug_when, 'US/Pacific','UTC'))*1000 AS modified_ts,
                {{comment_field}},
                c.isprivate
            FROM
                longdescs c
            LEFT JOIN
                profiles p ON c.who = p.userid
            LEFT JOIN
                longdescs_tags t ON t.comment_id=c.comment_id AND t.tag <> 'deleted'
            WHERE
                {{bug_filter}} AND
                bug_when >= {{start_time_str}}
            """, param)

        return comments
    except Exception, e:
        Log.error("can not get comment data", e)
def analysis(settings, last_run, please_stop):
    DIFF = 7
    if last_run:
        DIFF = 4      #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING

    try_again = True

    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
def test_confidential_whiteboard_is_screened(self):
    leaked_whiteboard = get(
        self.private,
        {"and": [
            {"terms": {"bug_group": SCREENED_WHITEBOARD_BUG_GROUPS}},
            {"exists": {"field": "status_whiteboard"}},
            {"not": {"terms": {"status_whiteboard": ["", "[screened]"]}}},
            {"range": {"expires_on": {"gte": NOW}}},            #CURRENT RECORDS
            {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},    #OF A MINIMUM AGE
        ]},
        fields=["bug_id", "product", "component", "status_whiteboard", "bug_group", "modified_ts"],
        limit=100
    )

    if leaked_whiteboard:
        for l in leaked_whiteboard:
            l.modified_ts = CNV.datetime2string(CNV.milli2datetime(l.modified_ts))

        Log.error("Whiteboard leaking:\n{{leak}}", {"leak": leaked_whiteboard})
def etl_comments(db, es, param, please_stop):
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db = DB(db)
            comment_db_cache.append(comment_db)

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for g, c in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
            es.extend({"id": cc.comment_id, "value": cc} for cc in c)
def make_test_instance(db_settings):
    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            #CLEAR SCHEMA
            Log.note("Make empty {{schema}} schema", {"schema": db_settings.schema})
            no_schema = db_settings.copy()
            no_schema.schema = None
            with DB(no_schema) as db:
                db.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema": db.quote_column(db_settings.schema)})
                db.execute("CREATE DATABASE {{schema}}", {"schema": db.quote_column(db_settings.schema)})

            #FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema": db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            #ADD MISSING TABLES
            with DB(db_settings) as db:
                db.execute("""
                CREATE TABLE `longdescs_tags` (
                    `id` mediumint(9) NOT NULL AUTO_INCREMENT,
                    `comment_id` int(11) DEFAULT NULL,
                    `tag` varchar(24) NOT NULL,
                    PRIMARY KEY (`id`),
                    UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`),
                    CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id` FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`) ON DELETE CASCADE ON UPDATE CASCADE
                ) DEFAULT CHARSET=utf8""")
        except Exception, e:
            Log.error("Can not setup test database", e)
def test_incremental_has_correct_expires_on(self):
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        #SETUP FIRST RUN PARAMETERS
        param = Struct()
        param.end_time = start_incremental
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        #SETUP INCREMENTAL RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(datetime.utcnow())
        param.start_time = start_incremental
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        for b in bugs:
            results = es.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"term": {"bug_id": b}},
                        {"range": {"expires_on": {"gte": CNV.datetime2milli(datetime.utcnow())}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id"]
            })

            if results.hits.total > 1:
                Log.error("Expecting only one active bug_version record")
def verify_no_private_comments(es, private_comments):
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"comment_id": private_comments}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    if Q.select(data.hits.hits, "_source"):
        Log.error("Expecting no comments")
def test_private_comments_not_leaking(self):
    leaked_comments = get(
        self.public_comments,
        {"term": {"isprivate": "1"}},
        limit=20
    )
    if leaked_comments:
        if self.settings.param.delete:
            self.public_comments.delete_record(
                {"terms": {"bug_id": leaked_comments.bug_id}}
            )

        Log.error("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
            "num": len(leaked_comments),
            "comments": leaked_comments
        })
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })

        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
def add_alias(lost, found):
    found_record = aliases.get(found, None)
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    delete_list = []

    #FOLD bugs ON lost=found
    for bug_id, agg in bugs.iteritems():
        v = agg.dic.get(lost, 0)
        if v != 0:
            agg.add(lost, -v)
            agg.add(found, v)

        if not agg:
            delete_list.append(bug_id)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        for bug_id, agg in bugs.iteritems():
            v = agg.dic.get(old_canonical, 0)
            if v != 0:
                agg.add(old_canonical, -v)
                agg.add(new_canonical, v)

            if not agg:
                delete_list.append(bug_id)

    for d in delete_list:
        del bugs[d]

    #FOLD ALIASES
    for k, v in aliases.iteritems():
        if v.canonical == old_canonical:
            Log.note("ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                "alias": k,
                "old": old_canonical,
                "new": found
            })
            v.canonical = found
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda x: {"id": x.id, "value": x},
                    map(
                        lambda x: transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
def get_or_create_index(destination_settings, source):
    #CHECK IF INDEX, OR ALIAS, EXISTS
    es = ElasticSearch(destination_settings)
    aliases = es.get_aliases()

    indexes = [a for a in aliases if a.alias == destination_settings.index]
    if not indexes:
        #CREATE INDEX
        schema = source.get_schema()
        assert schema.settings
        assert schema.mappings
        ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
    elif len(indexes) > 1:
        Log.error("do not know how to replicate to more than one index")
    elif indexes[0].alias != None:
        destination_settings.alias = destination_settings.index
        destination_settings.index = indexes[0].index

    return ElasticSearch(destination_settings)
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = Q.sort(
                    #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                    [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                    "modified_ts"
                )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

    if found_errors:
        Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {"path": [try_dir, ref_dir]})
def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [
        {"lost": n, "found": d.canonical}
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical
    ]

    sorted = Q.sort(data, "found")
    for s in sorted:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))