def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER
    """
    current_run_time = get_current_time(db)

    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                if not settings.es.alias:
                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = temp.last()
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  # REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                if not settings.es_comments.alias:
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
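# A minimal sketch (plain Python, hypothetical paths) of the marker-file
# protocol setup_es() relies on.  Each marker holds epoch milliseconds as text:
#   neither file exists            -> fresh start (handled outside this excerpt)
#   first_run_time only            -> resume the initial fill from timestamp 0
#   first_run_time + last_run_time -> incremental update since the last run
import os
import time

FIRST_RUN_TIME = "./resume/first_run_time"   # hypothetical paths; the real
LAST_RUN_TIME = "./resume/last_run_time"     # ones come from settings.param

if not os.path.exists("./resume"):
    os.makedirs("./resume")
if not os.path.exists(FIRST_RUN_TIME):
    with open(FIRST_RUN_TIME, "w") as f:
        f.write(str(int(time.time() * 1000)))  # MILLIS, AS setup_es() EXPECTS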
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [
                b
                for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                if b not in private_bugs
            ]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            # SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                # COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
            except Exception, e:
                Log.warning(
                    "Total failure during compare of bugs {{bugs}}",
                    {"bugs": some_bugs},
                    e
                )
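# A minimal sketch (standard library only, hypothetical private set) of the
# sampling step above: draw NUM_TO_TEST ids uniformly from [0, MAX_BUG_ID) and
# drop known-private ones, accepting that a batch may come up slightly short.
import random

NUM_TO_TEST = 100
MAX_BUG_ID = 900000
private_bugs = set([42, 1337])  # HYPOTHETICAL; NORMALLY FETCHED FROM THE REFERENCE ES

some_bugs = [
    b
    for b in [random.randrange(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
    if b not in private_bugs
]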
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {
                "filename": nvl(settings.param.alias_file.path, settings.param.alias_file)
            }):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            Log.warning("No alias file found (looking at {{filename}})", {
                "filename": nvl(settings.param.alias_file.path, settings.param.alias_file)
            })
            alias_json = "{}"

        # self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception, e:
        Log.error("Can not load aliases", e)
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda (x): {"id": x.id, "value": x},
                    map(
                        lambda (x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
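# A minimal sketch (standard library only) of the block-at-a-time pattern
# extract_from_file() uses: process a line-delimited JSON file in fixed-size
# blocks so one bad block can be quarantined to an error file without aborting
# the whole load.  The file name and "bug_id" key are assumptions.
import json
from itertools import islice

BATCH_SIZE = 1000  # ASSUMED; THE REAL CONSTANT IS DEFINED ELSEWHERE IN THE MODULE

def blocks(lines, size=BATCH_SIZE):
    # YIELD (block_number, list_of_lines) PAIRS, MUCH LIKE Q.groupby(handle, size=...)
    it = iter(lines)
    g = 0
    while True:
        d = list(islice(it, size))
        if not d:
            return
        yield g, d
        g += 1

with open("bugs.json") as handle:
    for g, d in blocks(handle):
        rows = [{"id": json.loads(x)["bug_id"], "value": x} for x in d]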
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts"
                )
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = Q.sort(
                    # ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                    [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                    "modified_ts"
                )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

    if found_errors:
        Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {"path": [try_dir, ref_dir]})
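# A hedged follow-up sketch: once compare_both() has written a mismatching bug
# to the try/ and ref/ directories, a plain unified diff per bug id makes the
# discrepancy easy to read.  The directory names and bug id are hypothetical.
import difflib

bug_id = 123456  # HYPOTHETICAL
can = open("errors/try/" + str(bug_id) + ".txt").read().splitlines()
ref = open("errors/ref/" + str(bug_id) + ".txt").read().splitlines()
for line in difflib.unified_diff(ref, can, "reference", "candidate", lineterm=""):
    print line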
def milli2datetime(r):
    """
    CONVERT ANY longs INTO TIME STRINGS
    """
    try:
        if r == None:
            return None
        elif isinstance(r, basestring):
            return r
        elif Math.is_number(r):
            if CNV.value2number(r) > 800000000000:
                return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
            else:
                return r
        elif isinstance(r, dict):
            output = {}
            for k, v in r.items():
                v = milli2datetime(v)
                if v != None:
                    output[k.lower()] = v
            return output
        elif hasattr(r, '__iter__'):
            output = []
            for v in r:
                v = milli2datetime(v)
                if v != None:
                    output.append(v)
            if not output:
                return None
            try:
                return Q.sort(output)
            except Exception:
                return output
        else:
            return r
    except Exception, e:
        Log.warning("Can not scrub: {{json}}", {"json": r}, e)
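# A hedged usage sketch of milli2datetime(): numbers above 800000000000 are
# treated as epoch milliseconds and rendered as "%Y-%m-%d %H:%M:%S" strings,
# dict keys are lowercased, and None values are dropped from dicts and lists.
record = {
    "Bug_ID": 1234,                     # SMALL NUMBER: PASSES THROUGH (KEY LOWERCASED)
    "modified_ts": 1383895021000,       # MILLIS: BECOMES A TIME STRING
    "history": [1383895021000, None],   # None ENTRIES ARE DROPPED, LIST IS SORTED
}
print milli2datetime(record)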
def test_private_bugs_not_leaking(self):
    bad_news = False

    # FOR ALL BUG BLOCKS
    for min_id, max_id in self.blocks_of_bugs():
        results = get(
            self.private,
            {"and": [
                {"match_all": {}},
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"exists": {"field": "bug_group"}},
                    {"range": {"expires_on": {"gte": NOW}}},  # CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                ]}
            ]},
            ["bug_id", "bug_group", "modified_ts"]
        )

        private_ids = {b.bug_id: b.bug_group for b in results}

        Log.note("Ensure {{num}} bugs did not leak", {
            "num": len(private_ids.keys())
        })

        # VERIFY NONE IN PUBLIC
        leaked_bugs = get(
            self.public,
            {"and": [
                {"terms": {"bug_id": private_ids.keys()}},
                {"range": {"expires_on": {"gte": NOW}}}  # SOME BUGS WILL LEAK FOR A LITTLE WHILE
            ]}
        )

        if leaked_bugs:
            bad_news = True
            if self.settings.param.delete:
                self.public.delete_record(
                    {"terms": {"bug_id": leaked_bugs.bug_id}}
                )

            Log.note("{{num}} leaks!! {{bugs}}", {
                "num": len(leaked_bugs),
                "bugs": Q.run({
                    "from": leaked_bugs,
                    "select": [
                        "bug_id",
                        "bug_version_num",
                        {"name": "modified_ts", "value": lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}
                    ],
                    "sort": "bug_id"
                })
            })
            for b in leaked_bugs:
                Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", {
                    "bug_id": b.bug_id,
                    "bug_group": private_ids[b.bug_id],
                    "version": milli2datetime(b)
                })

        # CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
        leaked_comments = get(
            self.public_comments,
            {"terms": {"bug_id": private_ids.keys()}},
            limit=20
        )
        if leaked_comments:
            bad_news = True

            if self.settings.param.delete:
                self.public_comments.delete_record(
                    {"terms": {"bug_id": leaked_comments.bug_id}}
                )

            Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
                "num": len(leaked_comments),
                "comments": leaked_comments
            })

    if bad_news:
        Log.error("Bugs have leaked!")
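# A hedged sketch of the time window the test above encodes; the real NOW and
# A_WHILE_AGO constants are defined elsewhere in the test module, so the values
# here are assumptions.
import time

NOW = int(time.time() * 1000)          # EPOCH MILLISECONDS
A_WHILE_AGO = NOW - 10 * 60 * 1000     # E.G. TEN MINUTES OF ALLOWED ETL LAG

# A PRIVATE BUG VERSION IS EXPECTED TO BE ABSENT FROM THE PUBLIC CLUSTER WHEN
# IT IS THE CURRENT RECORD (expires_on >= NOW) AND OLD ENOUGH THAT THE ETL
# SHOULD ALREADY HAVE PROCESSED IT (modified_ts < A_WHILE_AGO).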