def verify_no_private_attachments(es, private_attachments):
    #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
    for b in Q.select(private_attachments, "bug_id"):
        versions = compare_es.get_all_bug_versions(es, b)
        #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
        #BELONGS TO, IF AT ALL
        for v in versions:
            for a in v.attachments:
                if a.attach_id in Q.select(private_attachments, "attach_id"):
                    Log.error("Private attachment should not exist")

def incremental_etl(settings, param, db, es, es_comments, output_queue):
    ####################################################################
    ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE
    ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS
    ####################################################################

    #REMOVE PRIVATE BUGS
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
    for g, delete_bugs in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
        if still_existing:
            Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": still_existing})
        es.delete_record({"terms": {"bug_id": delete_bugs}})
        es_comments.delete_record({"terms": {"bug_id": delete_bugs}})

    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record({"terms": {"bug_id": possible_public_bugs}})
    else:
        #PUBLIC BUGS
        #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
        #    IF REMOVING GROUP THEN NO RECORDS TO DELETE
        pass

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record({"terms": {"bug_id": bugs_to_refresh}})

    #REBUILD BUGS THAT GOT REMOVED
    bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs  # REMOVE PRIVATE BUGS
    if bug_list:
        refresh_param = param.copy()
        refresh_param.bug_list = bug_list
        refresh_param.start_time = 0
        refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

        try:
            etl(db, output_queue, refresh_param.copy(), please_stop=None)
            etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
        except Exception, e:
            Log.error("Problem with etl using parameters {{parameters}}", {"parameters": refresh_param}, e)

def analysis(settings, last_run, please_stop):
    DIFF = 7
    if last_run:
        DIFF = 4  #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    try_again = True

    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)

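
# A minimal, self-contained sketch of the matching idea in analysis() above, using
# collections.Counter in place of the project's Multiset and Q helpers.  The sample
# data and the `diff` threshold are illustrative assumptions, not values taken from
# the real Bugzilla data.
from collections import Counter

def pick_alias(bug_counts, diff=4):
    # bug_counts: {bug_id: Counter({email: count})}; a negative count means the
    # address was added to CC but never removed and is absent from the final list
    problems = Counter()
    for counts in bug_counts.values():
        for email, count in counts.items():
            if count < 0:
                problems[email] += count

    matches = {}
    for email, _ in sorted(problems.items(), key=lambda p: p[1]):
        # ONLY AGGREGATE BUGS THAT EXHIBIT THIS PROBLEM EMAIL
        solution = Counter()
        for counts in bug_counts.values():
            if counts.get(email, 0) < 0:
                solution += Counter({e: c for e, c in counts.items() if c > 0})
        ranked = solution.most_common()
        if ranked and (len(ranked) == 1 or ranked[0][1] - ranked[1][1] >= diff):
            matches[email] = ranked[0][0]  # DISTINCTIVE ENOUGH TO CALL AN ALIAS
    return matches

# toy example: "new@x.org" keeps being added while only "old@x.org" is ever removed
example = {
    1: Counter({"new@x.org": -1, "old@x.org": 5}),
    2: Counter({"new@x.org": -1, "old@x.org": 5}),
}
assert pick_alias(example) == {"new@x.org": "old@x.org"}
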
def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())

    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [
        {"lost": n, "found": d.canonical}
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical
    ]

    sorted = Q.sort(data, "found")
    for s in sorted:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))

def blocks_of_bugs(self):
    max_bug_id = self.private.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [{"match_all": {}}]}
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"0": {"statistical": {"field": "bug_id"}}}
    }).facets["0"].max

    return reversed(list(Q.intervals(0, max_bug_id, self.settings.param.increment)))

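
# blocks_of_bugs() walks the bug_id space from the top down in fixed-size blocks.
# A stdlib-only sketch of that interval logic; Q.intervals is the project helper,
# and the numbers below are made up for illustration.
def intervals(start, end, size):
    points = list(range(start, end, size)) + [end]
    return [(points[i], points[i + 1]) for i in range(len(points) - 1)]

assert list(reversed(intervals(0, 25, 10))) == [(20, 25), (10, 20), (0, 10)]
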
def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination
    """
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            d2 = map(
                lambda(x): {"id": x.id, "value": x},
                map(
                    lambda(x): transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x._source), old_school=True),
                    data.hits.hits
                )
            )
            destination.extend(d2)

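
# replicate() copies records in batches so no single request to the destination
# grows unbounded.  A stdlib-only sketch of the same batching pattern; the batch
# size and items are illustrative placeholders, not the project's Q.groupby.
def batches(items, size):
    for i in range(0, len(items), size):
        yield i // size, items[i:i + size]

copied = []
for group_num, chunk in batches(list(range(10)), 4):
    copied.append((group_num, chunk))
assert copied == [(0, [0, 1, 2, 3]), (1, [4, 5, 6, 7]), (2, [8, 9])]
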
def get(es, esfilter, fields=None, limit=None):
    query = struct.wrap({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": esfilter
        }},
        "from": 0,
        "size": nvl(limit, 200000),
        "sort": []
    })

    if fields:
        query.fields = fields
        results = es.search(query)
        return Q.select(results.hits.hits, "fields")
    else:
        results = es.search(query)
        return Q.select(results.hits.hits, "_source")

def aggregator(data):
    """
    FLATTEN CC LISTS OVER TIME BY BUG
    MULTISET COUNTS THE NUMBER OF EMAIL AT BUG CREATION
    NEGATIVE MEANS THERE WAS AN ADD WITHOUT A REMOVE (AND NOT IN CURRENT LIST)
    """
    for d in data:
        new_emails = Q.map2set(split_email(d.new_value), alias)
        old_emails = Q.map2set(split_email(d.old_value), alias)

        for e in new_emails | old_emails:
            details = aliases.get(e, Struct())
            aliases[e] = details

        agg = bugs.get(d.bug_id, Multiset(allow_negative=True))
        agg = agg - new_emails
        agg = agg + old_emails
        bugs[d.bug_id] = agg

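
# Why the counts kept by aggregator() go negative: each cc-change row is replayed
# in reverse, subtracting the addresses seen in new_value and adding those seen in
# old_value.  An address that was added under one spelling but only ever removed
# under another ends up below zero.  Stdlib sketch with made-up change rows:
from collections import Counter

def replay(changes):
    agg = Counter()
    for new_value, old_value in changes:
        agg.subtract(new_value.split(", ") if new_value else [])
        agg.update(old_value.split(", ") if old_value else [])
    return agg

history = [
    ("person@new.org", ""),   # added under the new address...
    ("", "person@old.org"),   # ...but removed under the old one
]
assert replay(history) == Counter({"person@new.org": -1, "person@old.org": 1})
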
def get_screened_whiteboard(db):
    if not SCREENED_BUG_GROUP_IDS:
        groups = db.query("SELECT id FROM groups WHERE {{where}}", {
            "where": esfilter2sqlwhere(db, {"terms": {"name": SCREENED_WHITEBOARD_BUG_GROUPS}})
        })
        globals()["SCREENED_BUG_GROUP_IDS"] = Q.select(groups, "id")

def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

    if found_errors:
        Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {"path": [try_dir, ref_dir]})

def etl_comments(db, es, param, please_stop):
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db = DB(db)
            comment_db_cache.append(comment_db)

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for g, c in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
            es.extend({"id": cc.comment_id, "value": cc} for cc in c)

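
# etl_comments() keeps a single expensive DB connection in a module-level cache
# that is only filled while holding a lock.  A stdlib sketch of that "create once
# under a lock, reuse afterwards" pattern; make_connection is a stand-in for DB(db).
import threading

_cache = []
_cache_lock = threading.Lock()

def get_connection(make_connection):
    with _cache_lock:
        if not _cache:
            _cache.append(make_connection())
        return _cache[0]

first = get_connection(object)
second = get_connection(object)
assert first is second  # THE FACTORY RAN ONLY ONCE
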
def verify_no_private_comments(es, private_comments):
    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"comment_id": private_comments}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    if Q.select(data.hits.hits, "_source"):
        Log.error("Expecting no comments")

def get_all_bug_versions(es, bug_id, max_time=None):
    max_time = nvl(max_time, datetime.max)

    data = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"term": {"bug_id": bug_id}},
                {"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    return Q.select(data.hits.hits, "_source")

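
# get_all_bug_versions() returns every stored version of one bug at or before a
# cutoff, defaulting the cutoff to "no limit".  The same selection over plain
# dicts; the records and cutoff below are illustrative only.
import sys

def bug_versions(records, bug_id, max_time=None):
    cutoff = max_time if max_time is not None else sys.maxsize
    return [r for r in records if r["bug_id"] == bug_id and r["modified_ts"] <= cutoff]

records = [
    {"bug_id": 1, "modified_ts": 100},
    {"bug_id": 1, "modified_ts": 300},
    {"bug_id": 2, "modified_ts": 100},
]
assert bug_versions(records, 1, max_time=200) == [{"bug_id": 1, "modified_ts": 100}]
assert len(bug_versions(records, 1)) == 2
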
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION
    """
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    if bug.votes == 0:
        del bug["votes"]
    # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
    #     bug.remaining_time = 0
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )

    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    if bug.id == "692436_1336314345":
        bug.votes = 3

    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass

def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda(x): {"id": x.id, "value": x},
                    map(
                        lambda(x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)

def etl(db, output_queue, param, please_stop):
    """
    PROCESS RANGE, AS SPECIFIED IN param AND PUSH
    BUG VERSION RECORDS TO output_queue
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with db_cache_lock:
        if not db_cache:
            with Timer("open connections to db"):
                for f in get_stuff_from_bugzilla:
                    db = DB(db)
                    db_cache.append(db)

    db_results = Queue(max=2**30)
    with db_cache_lock:
        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
        with AllThread() as all:
            for i, f in enumerate(get_stuff_from_bugzilla):
                def process(target, db, param, please_stop):
                    db_results.extend(target(db, param))

                all.add(process, f, db_cache[i], param.copy())
        db_results.add(Thread.STOP)

    sorted = Q.sort(db_results, [
        "bug_id",
        "_merge_order",
        {"field": "modified_ts", "sort": -1},
        "modified_by"
    ])

    process = BugHistoryParser(param, output_queue)
    for s in sorted:
        process.processRow(s)
    process.processRow(struct.wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))

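
# etl() sorts the merged rows by bug_id, then _merge_order, then modified_ts
# DESCENDING, then modified_by.  With the stdlib the same ordering can be expressed
# by negating the descending numeric key; the rows below are illustrative.
rows = [
    {"bug_id": 1, "_merge_order": 2, "modified_ts": 100, "modified_by": "a"},
    {"bug_id": 1, "_merge_order": 2, "modified_ts": 200, "modified_by": "b"},
    {"bug_id": 1, "_merge_order": 1, "modified_ts": 150, "modified_by": "c"},
]
ordered = sorted(rows, key=lambda r: (r["bug_id"], r["_merge_order"], -r["modified_ts"], r["modified_by"]))
assert [r["modified_by"] for r in ordered] == ["c", "b", "a"]
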
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS
    """
    try:
        if r == None:
            return None
        elif isinstance(r, basestring):
            return r
        elif Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
            else:
                return r
        elif isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        elif hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                return output
        else:
            return r
    except Exception, e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)

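
# milli2datetime() treats any number above 800000000000 as epoch milliseconds
# (roughly mid-1995) and renders it as a timestamp string; smaller numbers are
# left alone.  A stdlib sketch of that heuristic:
from datetime import datetime

def scrub_number(value):
    if value > 800000000000:
        return datetime.utcfromtimestamp(value / 1000).strftime("%Y-%m-%d %H:%M:%S")
    return value

assert scrub_number(42) == 42
assert scrub_number(1356998400000) == "2013-01-01 00:00:00"
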
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) ARE EMAIL ADDRESSES THAT BELONG TO PEOPLE.
    SINCE THE EMAIL ADDRESS FOR A PERSON CAN CHANGE OVER TIME, THIS CODE
    WILL ASSOCIATE EACH PERSON WITH THE EMAIL ADDRESSES USED OVER THE
    LIFETIME OF THE BUGZILLA DATA.  'PERSON' IS ABSTRACT, AND SIMPLY
    ASSIGNED A CANONICAL EMAIL ADDRESS TO FACILITATE IDENTIFICATION
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    if bug_list:
        with DB(settings.bugzilla, readonly=True) as db:
            data = get_all_cc_changes(db, bug_list)
            aggregator(data)
            analysis(settings, True, please_stop)
        return

    with DB(settings.bugzilla, readonly=True) as db:
        start = nvl(settings.param.start, 0)
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for s, e in Q.intervals(start, end, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {"start": s, "end": e})
            data = get_all_cc_changes(db, range(s, e))
            if please_stop:
                break
            aggregator(data)

            analysis(settings, e >= end, please_stop)

def test_changes_to_private_bugs_still_have_bug_group(self):
    self.settings.param.allow_private_bugs = True
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()

    private_bugs = set(Random.sample(self.settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(self.settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    bz_etl.main(self.settings, es, es_c)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            old_bug = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": b})[0]
            new_bug = old_bug.copy()

            new_bug.bug_status = "NEW STATUS"
            diff(db, "bugs", old_bug, new_bug)

    #RUN INCREMENTAL
    bz_etl.main(self.settings, es, es_c)

    #VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    now = datetime.utcnow()
    results = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"bug_id": private_bugs}},
                {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })
    latest_bugs = Q.select(results.hits.hits, "_source")
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

    for bug_id in private_bugs:
        if latest_bugs_index[bug_id] == None:
            Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

        bug_group = latest_bugs_index[bug_id].bug_group
        if not bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                "bug_id": bug_id,
                "bug_group": BUG_GROUP_FOR_TESTING
            })

def test_private_bugs_not_leaking(self):
    bad_news = False

    # FOR ALL BUG BLOCKS
    for min_id, max_id in self.blocks_of_bugs():
        results = get(
            self.private,
            {"and": [
                {"match_all": {}},
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"exists": {"field": "bug_group"}},
                    {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  #OF A MINIMUM AGE
                ]}
            ]},
            ["bug_id", "bug_group", "modified_ts"]
        )

        private_ids = {b.bug_id: b.bug_group for b in results}

        Log.note("Ensure {{num}} bugs did not leak", {"num": len(private_ids.keys())})

        # VERIFY NONE IN PUBLIC
        leaked_bugs = get(
            self.public,
            {"and": [
                {"terms": {"bug_id": private_ids.keys()}},
                {"range": {"expires_on": {"gte": NOW}}}  # SOME BUGS WILL LEAK FOR A LITTLE WHILE
            ]}
        )

        if leaked_bugs:
            bad_news = True
            if self.settings.param.delete:
                self.public.delete_record(
                    {"terms": {"bug_id": leaked_bugs.bug_id}}
                )

            Log.note("{{num}} leaks!! {{bugs}}", {
                "num": len(leaked_bugs),
                "bugs": Q.run({
                    "from": leaked_bugs,
                    "select": ["bug_id", "bug_version_num", {"name": "modified_ts", "value": lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}],
                    "sort": "bug_id"
                })
            })
            for b in leaked_bugs:
                Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", {
                    "bug_id": b.bug_id,
                    "bug_group": private_ids[b.bug_id],
                    "version": milli2datetime(b)
                })

        #CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
        leaked_comments = get(
            self.public_comments,
            {"terms": {"bug_id": private_ids.keys()}},
            limit=20
        )
        if leaked_comments:
            bad_news = True

            if self.settings.param.delete:
                self.public_comments.delete_record(
                    {"terms": {"bug_id": leaked_comments.bug_id}}
                )

            Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
                "num": len(leaked_comments),
                "comments": leaked_comments
            })

    if bad_news:
        Log.error("Bugs have leaked!")

def test_private_attachments_not_leaking(self):
    for min_id, max_id in self.blocks_of_bugs():
        # FIND ALL PRIVATE ATTACHMENTS
        bugs_w_private_attachments = get(
            self.private,
            {"and": [
                {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                {"range": {"expires_on": {"gte": NOW}}},  #CURRENT RECORDS
                {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  #OF A MINIMUM AGE
                {"nested": {  #HAS ATTACHMENT
                    "path": "attachments",
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"exists": {"field": "attachments.attach_id"}}
                    }}
                }},
                {"or": [
                    {"nested": {  #PRIVATE ATTACHMENT, OR...
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"term": {"attachments.isprivate": 1}}
                        }}
                    }},
                    {"exists": {"field": "bug_group"}}  # ...PRIVATE BUG
                ]}
            ]},
            fields=["bug_id", "bug_group", "attachments", "modified_ts"]
        )

        private_attachments = Q.run({
            "from": bugs_w_private_attachments,
            "select": "attachments.attach_id",
            "where": {"or": [
                {"exists": "bug_group"},
                {"terms": {"attachments.isprivate": ['1', True, 1]}}
            ]}
        })
        try:
            private_attachments = [int(v) for v in private_attachments]
        except Exception, e:
            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {"or": [
                    {"exists": "bug_group"},
                    {"terms": {"attachments.isprivate": ['1', True, 1]}}
                ]}
            })

        Log.note("Ensure {{num}} attachments did not leak", {"num": len(private_attachments)})

        #VERIFY NONE IN PUBLIC
        leaked_bugs = get(
            self.public,
            {"and": [
                {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                {"range": {"expires_on": {"gte": NOW}}},  # CURRENT BUGS
                {"nested": {
                    "path": "attachments",
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"terms": {"attach_id": private_attachments}}
                    }}
                }}
            ]}
            # fields=["bug_id", "attachments"]
        )

        if leaked_bugs:
            if self.settings.param.delete:
                self.public.delete_record(
                    {"terms": {"bug_id": leaked_bugs.bug_id}}
                )

            Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
            for b in leaked_bugs:
                Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                    "bug_id": b.bug_id,
                    "version": b
                })
            Log.error("Attachments have leaked!")

def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
    with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
        start = nvl(settings.param.start, 0)
        if resume_from_last_run:
            start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))

        #############################################################
        ## MAIN ETL LOOP
        #############################################################

        #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
        # with Multithread([run_both_etl, run_both_etl]) as workers:
        for min, max in Q.intervals(start, end, settings.param.increment):
            if settings.args.quick and min < end - settings.param.increment and min != 0:
                #--quick ONLY DOES FIRST AND LAST BLOCKS
                continue

            try:
                #GET LIST OF CHANGED BUGS
                with Timer("time to get {{min}}..{{max}} bug list", {"min": min, "max": max}):
                    if param.allow_private_bugs:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}})
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")
                    else:
                        bug_list = Q.select(db.query("""
                            SELECT
                                b.bug_id
                            FROM
                                bugs b
                            LEFT JOIN
                                bug_group_map m ON m.bug_id=b.bug_id
                            WHERE
                                delta_ts >= {{start_time_str}} AND
                                ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND
                                m.bug_id IS NULL
                        """, {
                            "min": min,
                            "max": max,
                            "start_time_str": param.start_time_str
                        }), u"bug_id")

                if not bug_list:
                    continue

                param.bug_list = bug_list
                run_both_etl(**{
                    "db": db,
                    "output_queue": output_queue,
                    "es_comments": es_comments,
                    "param": param.copy()
                })
            except Exception, e:
                Log.error("Problem with dispatch loop in range [{{min}}, {{max}})", {
                    "min": min,
                    "max": max
                }, e)

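
# The --quick path in full_etl() processes only the first and the last block of the
# bug_id range.  A stdlib sketch of that skip rule; the range and increment are
# example values.
def quick_blocks(start, end, increment):
    kept = []
    for block_min in range(start, end, increment):
        if block_min < end - increment and block_min != 0:
            continue  # --quick ONLY DOES FIRST AND LAST BLOCKS
        kept.append(block_min)
    return kept

assert quick_blocks(0, 100, 10) == [0, 90]
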
def normalize(bug, old_school=False):
    bug = bug.copy()
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            for k, v in list(a.items()):
                if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
                    new_v = CNV.value2int(v)
                    new_k = k[12:]
                    a[k.replace(".", "\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])  # ATTACHMENTS MUST EXIST

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None:
            continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert to "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Convert "2012/01/01 00:00:00.000" to "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception, e:
            Log.error("problem with converting date to milli (value={{value}})", {"value": bug[dateField]}, e)

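
# The date handling at the end of normalize() maps several Bugzilla date spellings
# onto epoch milliseconds.  A stdlib sketch of the same idea; the regexes below are
# assumptions standing in for DATE_PATTERN_STRICT_SHORT / DATE_PATTERN_RELAXED.
import re
from datetime import datetime

EPOCH = datetime(1970, 1, 1)
STRICT_SHORT = re.compile(r"^\d{4}[-/]\d{2}[-/]\d{2} \d{2}:\d{2}:\d{2}$")  # 2012/01/01 00:00:00
RELAXED = re.compile(r"^\d{4}-\d{2}-\d{2}")                                # 2012-01-01...

def to_millis(value):
    if STRICT_SHORT.match(value):
        parsed = datetime.strptime(value.replace("-", "/"), "%Y/%m/%d %H:%M:%S")
    elif RELAXED.match(value):
        parsed = datetime.strptime(value[0:10], "%Y-%m-%d")
    else:
        raise ValueError(value)
    return int((parsed - EPOCH).total_seconds() * 1000)

assert to_millis("2012/01/01 00:00:00") == 1325376000000
assert to_millis("2012-01-01") == 1325376000000
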
refresh_param = param.copy()
refresh_param.bug_list = bug_list
refresh_param.start_time = 0
refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

try:
    etl(db, output_queue, refresh_param.copy(), please_stop=None)
    etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
except Exception, e:
    Log.error("Problem with etl using parameters {{parameters}}", {"parameters": refresh_param}, e)

#REFRESH COMMENTS WITH PRIVACY CHANGE
private_comments = get_recent_private_comments(db, param)
comment_list = set(Q.select(private_comments, "comment_id")) | {0}
es_comments.delete_record({"terms": {"comment_id": comment_list}})
changed_comments = get_comments_by_id(db, comment_list, param)
es_comments.extend({"id": c.comment_id, "value": c} for c in changed_comments)

#GET LIST OF CHANGED BUGS
with Timer("time to get changed bug list"):
    if param.allow_private_bugs:
        bug_list = Q.select(db.query("""
            SELECT
                b.bug_id
            FROM
                bugs b
            WHERE
                delta_ts >= {{start_time_str}}
        """, {

del bug["everconfirmed"]
if bug.id == "692436_1336314345":
    bug.votes = 3

try:
    if Math.is_number(bug.cf_last_resolved):
        bug.cf_last_resolved = long(bug.cf_last_resolved)
    else:
        bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
except Exception, e:
    pass

bug = transform_bugzilla.rename_attachments(bug)
for c in bug.changes:
    c.field_name = c.field_name.replace("attachments.", "attachments_")
    if c.attach_id == '':
        c.attach_id = None
    else:
        c.attach_id = CNV.value2int(c.attach_id)

bug.attachments = Q.sort(bug.attachments, "attach_id")
for a in bug.attachments:
    a.attach_id = CNV.value2int(a.attach_id)
    for k, v in list(a.items()):
        if k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate"):
            struct.unwrap(a)[k] = CNV.value2int(v)  # PREVENT dot (.) INTERPRETATION
            a[k.split(".")[-1].split("_")[-1]] = CNV.value2int(v)

bug = transform_bugzilla.normalize(bug)
return bug

def get_bugs(db, param):
    try:
        get_bugs_table_columns(db, db.settings.schema)
        get_screened_whiteboard(db)

        #TODO: CF_LAST_RESOLVED IS IN PDT, FIX IT
        def lower(col):
            if col.column_type.startswith("varchar"):
                return "lower(" + db.quote_column(col.column_name) + ") " + db.quote_column(col.column_name)
            else:
                return db.quote_column(col.column_name)

        param.bugs_columns = Q.select(bugs_columns, "column_name")
        param.bugs_columns_SQL = SQL(",\n".join([lower(c) for c in bugs_columns]))
        param.bug_filter = esfilter2sqlwhere(db, {"terms": {"b.bug_id": param.bug_list}})
        param.screened_whiteboard = esfilter2sqlwhere(db, {"and": [
            {"exists": "m.bug_id"},
            {"terms": {"m.group_id": SCREENED_BUG_GROUP_IDS}}
        ]})

        if param.allow_private_bugs:
            param.sensitive_columns = SQL("""
                '[screened]' short_desc,
                '[screened]' bug_file_loc
            """)
        else:
            param.sensitive_columns = SQL("""
                short_desc,
                bug_file_loc
            """)

        bugs = db.query("""
            SELECT
                b.bug_id,
                UNIX_TIMESTAMP(CONVERT_TZ(b.creation_ts, 'US/Pacific','UTC'))*1000 AS modified_ts,
                lower(pr.login_name) AS modified_by,
                UNIX_TIMESTAMP(CONVERT_TZ(b.creation_ts, 'US/Pacific','UTC'))*1000 AS created_ts,
                lower(pr.login_name) AS created_by,
                lower(pa.login_name) AS assigned_to,
                lower(pq.login_name) AS qa_contact,
                lower(prod.`name`) AS product,
                lower(comp.`name`) AS component,
                CASE
                    WHEN {{screened_whiteboard}} AND b.status_whiteboard IS NOT NULL AND trim(b.status_whiteboard)<>''
                    THEN '[screened]'
                    ELSE trim(lower(b.status_whiteboard))
                END status_whiteboard,
                {{sensitive_columns}},
                {{bugs_columns_SQL}}
            FROM
                bugs b
            LEFT JOIN
                profiles pr ON b.reporter = pr.userid
            LEFT JOIN
                profiles pa ON b.assigned_to = pa.userid
            LEFT JOIN
                profiles pq ON b.qa_contact = pq.userid
            LEFT JOIN
                products prod ON prod.id = product_id
            LEFT JOIN
                components comp ON comp.id = component_id
            LEFT JOIN
                bug_group_map m ON m.bug_id = b.bug_id
            WHERE
                {{bug_filter}}
            """, param)

        #bugs IS LIST OF BUGS WHICH MUST BE CONVERTED TO THE DELTA RECORDS FOR ALL FIELDS
        output = []
        for r in bugs:
            flatten_bugs_record(r, output)

        return output
    except Exception, e:
        Log.error("can not get basic bug data", e)

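
# get_bugs() screens the status_whiteboard of bugs in screened groups while keeping
# the field for everyone else (the CASE expression in the SQL above).  The same
# rule in plain Python; the sample values are illustrative.
def screen_whiteboard(whiteboard, in_screened_group):
    if in_screened_group and whiteboard and whiteboard.strip():
        return "[screened]"
    return whiteboard.strip().lower() if whiteboard else whiteboard

assert screen_whiteboard("DO NOT SHOW", True) == "[screened]"
assert screen_whiteboard("[qa+]", False) == "[qa+]"
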
def search(self, query):
    query = wrap(query)
    f = CNV.esfilter2where(query.query.filtered.filter)
    filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
    if query.fields:
        return wrap({"hits": {"total": len(filtered), "hits": [{"_id": d._id, "fields": unwrap(Q.select([unwrap(d._source)], query.fields)[0])} for d in filtered]}})
    else:
        return wrap({"hits": {"total": len(filtered), "hits": filtered}})

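
# search() above emulates just enough of the ES response shape for the tests: apply
# a predicate to every stored document and wrap the survivors as hits.  A stdlib
# sketch of that shape; the documents and predicate are made up.
def fake_search(data, predicate):
    hits = [{"_id": i, "_source": d} for i, d in data.items() if predicate(d)]
    return {"hits": {"total": len(hits), "hits": hits}}

result = fake_search({"a": {"bug_id": 1}, "b": {"bug_id": 2}}, lambda d: d["bug_id"] == 2)
assert result["hits"]["total"] == 1
assert result["hits"]["hits"][0]["_id"] == "b"
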