def test_private_etl(self):
    """
    ENSURE IDENTIFIABLE INFORMATION DOES NOT EXIST ON ANY BUGS
    """
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()
    self.settings.param.allow_private_bugs = True

    database.make_test_instance(self.settings.bugzilla)
    es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
    bz_etl.main(self.settings, es, es_comments)

    ref = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)
    compare_both(es, ref, self.settings, self.settings.param.bugs)

    #DIRECT COMPARE THE FILE JSON
    can = File(self.settings.fake.comments.filename).read()
    ref = File(self.settings.private_comments_reference.filename).read()
    if can != ref:
        found = -1
        for i, c in enumerate(can):
            if can[i] != ref[i]:
                found = i
                break
        Log.error("Comments do not match reference\n{{sample}}", {"sample": can[MIN([0, found - 100]):found + 100]})
def test_public_etl(self):
    """
    ENSURE ETL GENERATES WHAT'S IN THE REFERENCE FILE
    """
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()
    self.settings.param.allow_private_bugs = Null

    database.make_test_instance(self.settings.bugzilla)
    es = elasticsearch.make_test_instance("candidate", self.settings.fake.bugs)
    es_comments = elasticsearch.make_test_instance("candidate_comments", self.settings.fake.comments)
    bz_etl.main(self.settings, es, es_comments)

    ref = elasticsearch.open_test_instance("reference", self.settings.public_bugs_reference)
    compare_both(es, ref, self.settings, self.settings.param.bugs)

    #DIRECT COMPARE THE FILE JSON
    can = File(self.settings.fake.comments.filename).read()
    ref = File(self.settings.public_comments_reference.filename).read()
    if can != ref:
        found = -1
        for i, c in enumerate(can):
            if can[i] != ref[i]:
                found = i
                break
        Log.error("Comments do not match reference\n{{sample}}", {"sample": can[MIN(0, found - 100):found + 100]})
def main(settings):
    file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(file.read())
    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    data = [
        {"lost": n, "found": d.canonical}
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical
    ]

    sorted = Q.sort(data, "found")
    for s in sorted:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))
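# A minimal sketch of the alias-file JSON that main() above reads, inferred from the
# fields it touches: each key is an email address, "canonical" is the resolved identity
# (or null), and "candidates" is a plain dict of match counts before the dict2Multiset
# conversion. The addresses and counts below are made up for illustration only.
EXAMPLE_ALIAS_FILE = """
{
    "alice@example.com": {
        "canonical": "alice@example.org",
        "candidates": {"alice@example.org": 4, "bob@example.org": 1}
    },
    "bob@example.org": {
        "canonical": null,
        "candidates": {"bob@example.com": 2}
    }
}
"""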
def start():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--quick", "--fast"],
            "help": "use this to process the first and last block, useful for testing the config settings before doing a full run",
            "action": "store_true",
            "dest": "quick"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "use this to force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }])

        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                for l in struct.listwrap(settings.debug.log):
                    if l.filename:
                        File(l.filename).parent.delete()
                File(settings.param.first_run_time).delete()
                File(settings.param.last_run_time).delete()

            Log.start(settings.debug)
            main(settings)
    except Exception, e:
        Log.fatal("Can not start", e)
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.
    OF COURSE, IT ONLY WORKS WHEN I HAVE A REFERENCE TO COMPARE TO
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                #COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
                else:
                    pass
            except Exception, e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def main(settings, es=None, es_comments=None):
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
def __init__(self, settings):
    self.settings = wrap({"host": "fake", "index": "fake"})
    self.filename = settings.filename
    try:
        self.data = CNV.JSON2object(File(self.filename).read())
    except IOError:
        self.data = Struct()
class Log_usingFile(BaseLog):
    def __init__(self, file):
        assert file
        from ..env.files import File

        self.file = File(file)
        if self.file.exists:
            self.file.backup()
            self.file.delete()

        self.file_lock = threads.Lock()

    def write(self, template, params):
        with self.file_lock:
            self.file.append(expand_template(template, params))
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda (x): {"id": x.id, "value": x},
                    map(
                        lambda (x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
def saveAliases(settings):
    compressed = {
        email: details
        for email, details in aliases.iteritems()
        if details.canonical
    }

    #COMPARE WITH PREVIOUS ALIAS VERSION
    try:
        old_alias_json = File(settings.param.alias_file).read()
    except Exception, e:
        old_alias_json = "{}"
def compare_both(candidate, reference, settings, some_bugs):
    File(settings.param.errors).delete()
    try_dir = settings.param.errors + "/try/"
    ref_dir = settings.param.errors + "/ref/"

    with Timer("Comparing to reference"):
        found_errors = False
        for bug_id in some_bugs:
            try:
                versions = Q.sort(
                    get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                    "modified_ts")
                # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
                if not versions:
                    max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
                else:
                    max_time = CNV.milli2datetime(versions.last().modified_ts)

                pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
                ref_versions = \
                    Q.sort(
                        #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                        [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                        "modified_ts"
                    )

                can = CNV.object2JSON(versions, pretty=True)
                ref = CNV.object2JSON(ref_versions, pretty=True)
                if can != ref:
                    found_errors = True
                    File(try_dir + unicode(bug_id) + ".txt").write(can)
                    File(ref_dir + unicode(bug_id) + ".txt").write(ref)
            except Exception, e:
                found_errors = True
                Log.warning("Problem ETL'ing bug {{bug_id}}", {"bug_id": bug_id}, e)

    if found_errors:
        Log.error("DIFFERENCES FOUND (Differences shown in {{path}})", {"path": [try_dir, ref_dir]})
def extend(self, records):
    """
    JUST SO WE MODEL A Queue
    """
    records = {v["id"]: v["value"] for v in records}
    struct.unwrap(self.data).update(records)

    data_as_json = CNV.object2JSON(self.data, pretty=True)
    File(self.filename).write(data_as_json)
    Log.note("{{num}} items added", {"num": len(records)})
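# A hedged usage sketch (not from the original repo): Fake_ES.extend() expects the same
# {"id": ..., "value": ...} wrappers that a ThreadedQueue/ElasticSearch sink receives,
# merges them into self.data keyed by id, and rewrites the backing JSON file. The
# filename and bug fields below are illustrative assumptions.
#
#   fake = Fake_ES(struct.wrap({"filename": "./results/fake_bugs.json"}))
#   fake.extend([
#       {"id": "1234_0", "value": {"bug_id": 1234, "bug_status": "NEW"}},
#       {"id": "1234_1", "value": {"bug_id": 1234, "bug_status": "RESOLVED"}}
#   ])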
def test_private_bugs_do_not_show(self):
    self.settings.param.allow_private_bugs = False
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()

    private_bugs = set(Random.sample(self.settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(self.settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    bz_etl.main(self.settings, es, es_c)

    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_no_private_bugs(es, private_bugs)
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
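# Hedged examples of the two settings shapes open_test_instance() distinguishes, based
# only on the attributes referenced above (filename selects a Fake_ES; host/schema_file
# imply a real cluster). The concrete paths, index names, and port are assumptions.
#
#   file-backed candidate (returns Fake_ES):
#       {"filename": "./tests/results/test_bugs.json"}
#
#   cluster-backed candidate (drops and recreates a real index):
#       {"host": "http://localhost", "port": 9200, "index": "test_bugs",
#        "type": "bug_version", "schema_file": "./resources/json/bug_version.json"}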
def stop(cls):
    if cls.profiler:
        from bzETL.util.cnv import CNV
        from bzETL.util.env.files import File

        p = pstats.Stats(cls.profiler)
        # p.stats MAPS (file, line, method) -> (call count, num calls, self time, cumulative time, callers)
        stats = [{
            "num_calls": d[1],
            "self_time": d[2],
            "total_time": d[3],
            "file": (f[0] if f[0] != "~" else "").replace("\\", "/"),
            "line": f[1],
            "method": f[2].lstrip("<").rstrip(">")
        }
            for f, d in p.stats.iteritems()
        ]
        File("profile.tab").write(CNV.list2tab(stats))

    cls.main_log.stop()
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {
                "filename": nvl(settings.param.alias_file.path, settings.param.alias_file)
            }):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            Log.warning("No alias file found (looking at {{filename}})", {
                "filename": nvl(settings.param.alias_file.path, settings.param.alias_file)
            })
            alias_json = "{}"

        #self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER
    """
    current_run_time = get_current_time(db)
    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                if not settings.es.alias:
                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = temp.last()
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  #REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                if not settings.es_comments.alias:
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
def main(settings):
    #USE A FILE
    if settings.source.filename != None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))

        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
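# A rough sketch of the settings this replication entry point consumes, limited to the
# attributes referenced above (source.filename/schema_filename for the file path, source
# and destination ES blocks, param.last_replication_time for bookkeeping). All values
# shown are placeholders, not the project's real configuration.
#
#   {
#       "source":      {"filename": null, "host": "http://source-cluster", "index": "bugs"},
#       "destination": {"host": "http://localhost", "index": "bugs_backup"},
#       "param":       {"last_replication_time": "./results/last_replication_time.txt"}
#   }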
def test_recent_private_stuff_does_not_show(self):
    self.settings.param.allow_private_bugs = False
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()

    database.make_test_instance(self.settings.bugzilla)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    bz_etl.main(self.settings, es, es_c)

    #MARK SOME STUFF PRIVATE
    with DB(self.settings.bugzilla) as db:
        #BUGS
        private_bugs = set(Random.sample(self.settings.param.bugs, 3))
        Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})
        for b in private_bugs:
            database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        #COMMENTS
        comments = db.query("SELECT comment_id FROM longdescs").comment_id
        marked_private_comments = Random.sample(comments, 5)
        for c in marked_private_comments:
            database.mark_comment_private(db, c, isprivate=1)

        #INCLUDE COMMENTS OF THE PRIVATE BUGS
        implied_private_comments = db.query("""
            SELECT comment_id FROM longdescs WHERE {{where}}
        """, {
            "where": esfilter2sqlwhere(db, {"terms": {"bug_id": private_bugs}})
        }).comment_id
        private_comments = marked_private_comments + implied_private_comments
        Log.note("The private comments are {{comments}}", {"comments": private_comments})

        #ATTACHMENTS
        attachments = db.query("SELECT bug_id, attach_id FROM attachments")
        private_attachments = Random.sample(attachments, 5)
        Log.note("The private attachments are {{attachments}}", {"attachments": private_attachments})
        for a in private_attachments:
            database.mark_attachment_private(db, a.attach_id, isprivate=1)

    if not File(self.settings.param.last_run_time).exists:
        Log.error("last_run_time should exist")
    bz_etl.main(self.settings, es, es_c)

    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_no_private_bugs(es, private_bugs)
    verify_no_private_attachments(es, private_attachments)
    verify_no_private_comments(es_c, private_comments)

    #MARK SOME STUFF PUBLIC
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            database.remove_bug_group(db, b, BUG_GROUP_FOR_TESTING)

    bz_etl.main(self.settings, es, es_c)

    #VERIFY BUG IS PUBLIC, BUT PRIVATE ATTACHMENTS AND COMMENTS STILL NOT
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_public_bugs(es, private_bugs)
    verify_no_private_attachments(es, private_attachments)
    verify_no_private_comments(es_c, marked_private_comments)
    constructor = None
    try:
        temp = __import__(path, globals(), locals(), [class_name], -1)
        constructor = object.__getattribute__(temp, class_name)
    except Exception, e:
        if settings.stream and not constructor:
            #PROVIDE A DEFAULT STREAM HANDLER
            constructor = Log_usingStream
        else:
            Log.error("Can not find class {{class}}", {"class": path}, e)

    #IF WE NEED A FILE, MAKE SURE DIRECTORY EXISTS
    if settings.filename:
        from ..env.files import File

        f = File(settings.filename)
        if not f.parent.exists:
            f.parent.create()

    settings['class'] = None
    params = struct.unwrap(settings)
    return constructor(**params)


def time_delta_pusher(please_stop, appender, queue, interval):
    """
    appender - THE FUNCTION THAT ACCEPTS A STRING
    queue - FILLED WITH LINES TO WRITE
    interval - timedelta

    USE IN A THREAD TO BATCH LOGS BY TIME INTERVAL
    """
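# A hypothetical log-settings entry to illustrate the dynamic constructor lookup at the
# top of the fragment above: the dotted "class" value is presumably split into a module
# path and class name for __import__, the remaining keys become the constructor's keyword
# arguments, and a "filename" key triggers the parent-directory creation. The class and
# values shown here are assumptions, not the project's defaults.
#
#   {"class": "logging.handlers.RotatingFileHandler",
#    "filename": "./results/logs/etl.log",
#    "maxBytes": 10000000,
#    "backupCount": 10}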
def make_test_instance(name, settings):
    if settings.filename:
        File(settings.filename).delete()
    return open_test_instance(name, settings)
                if not settings.es_comments.alias:
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
    else:
        # START ETL FROM BEGINNING, MAKE NEW INDEX
        last_run_time = 0
        if not es:
            # BUG VERSIONS
            schema = File(settings.es.schema_file).read()
            if transform_bugzilla.USE_ATTACHMENTS_DOT:
                schema = schema.replace("attachments_", "attachments\\.")
            schema = CNV.JSON2object(schema, paths=True)
            schema.settings = jsons.expand_dot(schema.settings)
            if not settings.es.alias:
                settings.es.alias = settings.es.index
                settings.es.index = ElasticSearch.proto_name(settings.es.alias)
            es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True)

            # BUG COMMENTS
            comment_schema = File(settings.es_comments.schema_file).read()
            comment_schema = CNV.JSON2object(comment_schema, paths=True)
            comment_schema.settings = jsons.expand_dot(comment_schema.settings)
            if not settings.es_comments.alias:
                settings.es_comments.alias = settings.es_comments.index
    old_aliases = {}
    for k, v in CNV.JSON2object(old_alias_json).iteritems():
        old_aliases[k] = struct.wrap(v)

    added = set(compressed.keys()) - set(old_aliases.keys())
    removed = set(old_aliases.keys()) - set(compressed.keys())
    common = set(compressed.keys()) & set(old_aliases.keys())

    changed = set()
    for c in common:
        if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON(old_aliases[c], pretty=True):
            changed.add(c)

    if added or removed or changed:
        alias_json = CNV.object2JSON(compressed, pretty=True)
        file = File(settings.param.alias_file)
        file.write(alias_json)

        Log.note("{{num}} of {{total}} aliases saved", {
            "num": len(compressed.keys()),
            "total": len(aliases.keys())
        })


def start():
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
def test_changes_to_private_bugs_still_have_bug_group(self):
    self.settings.param.allow_private_bugs = True
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()

    private_bugs = set(Random.sample(self.settings.param.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(self.settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    bz_etl.main(self.settings, es, es_c)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            old_bug = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": b})[0]
            new_bug = old_bug.copy()
            new_bug.bug_status = "NEW STATUS"
            diff(db, "bugs", old_bug, new_bug)

    #RUN INCREMENTAL
    bz_etl.main(self.settings, es, es_c)

    #VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    now = datetime.utcnow()
    results = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"bug_id": private_bugs}},
                {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })
    latest_bugs = Q.select(results.hits.hits, "_source")
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

    for bug_id in private_bugs:
        if latest_bugs_index[bug_id] == None:
            Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

        bug_group = latest_bugs_index[bug_id].bug_group
        if not bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                "bug_id": bug_id,
                "bug_group": BUG_GROUP_FOR_TESTING
            })