def __init__(self, settings):
    """Back this fake ES with the local JSON file named in settings; start empty when absent."""
    self.settings = wrap({"host": "fake", "index": "fake"})
    self.filename = settings.filename
    try:
        self.data = CNV.JSON2object(File(self.filename).read())
    except IOError:
        # NO CACHE FILE YET; BEGIN WITH AN EMPTY STORE
        self.data = Struct()
def main(settings):
    """Run the ETL against a local Fake_ES and compare the result to the reference ES."""
    # MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        # REAL ES
        # if settings.candidate.alias is None:
        #     settings.candidate.alias=settings.candidate.index
        #     settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        # candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
        quoted_columns = ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(quoted_columns))
        param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(settings.param.bugs)})

        etl(db, candidate, param)

        # COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)
def remove_bug_group(db, bug_id, group_name):
    """Detach bug_id from the named group, recording the change via diff()."""
    group_id = db.query(
        "SELECT id FROM groups WHERE name={{name}}",
        {"name": group_name}
    )[0].id

    # RECORD THE GROUP REMOVAL IN bugs_activity
    diff(
        db,
        "bugs",
        Struct(bug_id=bug_id, bug_group=group_name),
        Struct(bug_id=bug_id, bug_group=None)
    )
    db.execute(
        "DELETE FROM bug_group_map WHERE bug_id={{bug_id}} and group_id={{group_id}}",
        {"bug_id": bug_id, "group_id": group_id}
    )
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
    THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
    THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
    """
    # settings.param.allow_private_bugs = True
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = self.settings.param.bugs
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)

        # COMPARE ALL BUGS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        compare_both(candidate, reference, self.settings, self.settings.param.bugs)
def test_whiteboard_screened(self):
    """Bugs in a screened group must expose only a screened whiteboard."""
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        db.flush()

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        # bug 1046 sees lots of whiteboard, and other field, changes
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")
def main(settings):
    """Run the ETL against a local Fake_ES and compare the result to the reference ES."""
    # MAKE HANDLES TO CONTAINERS
    with DB(settings.bugzilla) as db:
        # REAL ES
        # if settings.candidate.alias is None:
        #     settings.candidate.alias=settings.candidate.index
        #     settings.candidate.index=settings.candidate.alias+CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        # candidate=ElasticSearch.create_index(settings.candidate, File(settings.candidate.schema_file).read())
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
        param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
        param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
        param.END_TIME = CNV.datetime2milli(datetime.utcnow())
        param.START_TIME = 0
        param.alias_file = settings.param.alias_file
        param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(settings.param.bugs)})

        etl(db, candidate, param)

        # COMPARE ALL BUGS
        compare_both(candidate, reference, settings, settings.param.bugs)
def random_sample_of_bugs(settings):
    """
    ETL RANDOM NON-PRIVATE BUG IDS AND COMPARE AGAINST THE REFERENCE ES,
    LOOPING UNTIL A DISCREPANCY IS FOUND.
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            # SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                # COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
                else:
                    pass
            except Exception as e:
                # FIX: corrected "faiure" -> "failure" in the log message
                D.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            samples = [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
            some_bugs = [b for b in samples if b not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            # SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                # COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
                else:
                    pass
            except Exception as e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def test_incremental_etl_catches_tracking_flags(self):
    """An incremental run starting after the flag changes must still pick them up."""
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([813650])
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, 813650)

        flags = ["cf_status_firefox18", "cf_status_firefox19", "cf_status_firefox_esr17", "cf_status_b2g18"]
        for v in versions:
            if v.modified_ts <= param.start_time:
                continue
            for f in flags:
                if v[f] != "fixed":
                    Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})
def test_ambiguous_whiteboard_screened(self):
    """A bug in both a screened and an unscreened group must still be screened."""
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        # MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
        db.flush()

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        # bug 1046 sees lots of whiteboard, and other field, changes
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
    THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.  COMPARE
    THOSE VERSIONS TO A REFERENCE ES (ALSO CHECKED INTO REPOSITORY)
    """
    # settings.param.allow_private_bugs = True
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = self.settings.param.bugs
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)

        # COMPARE ALL BUGS
        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        compare_both(candidate, reference, self.settings, self.settings.param.bugs)
def add_bug_group(db, bug_id, group_name):
    """Attach bug_id to the named group, creating the group row when missing."""
    group_exists = db.query(
        "SELECT id FROM groups WHERE name={{name}}",
        {"name": group_name}
    )
    if not group_exists:
        # GROUP IS MISSING; CREATE IT, THEN RE-QUERY FOR ITS id
        db.insert("groups", {
            "name": group_name,
            "description": group_name,
            "isbuggroup": 1,
            "userregexp": 0
        })
        group_exists = db.query(
            "SELECT id FROM groups WHERE name={{name}}",
            {"name": group_name}
        )
    group_id = group_exists[0].id

    # RECORD THE GROUP ADDITION IN bugs_activity
    diff(
        db,
        "bugs",
        Struct(bug_id=bug_id, bug_group=None),
        Struct(bug_id=bug_id, bug_group=group_name)
    )
    db.insert("bug_group_map", {"bug_id": bug_id, "group_id": group_id})
def flatten_bugs_record(r, output):
    """Append one flat field-change row to output for every field of r not equal to "---"."""
    for field_name, value in r.items():
        if value == "---":
            # "---" IS BUGZILLA'S PLACEHOLDER FOR "NO VALUE"; SKIP IT
            continue
        row = Struct()
        row.bug_id = r.bug_id
        row.modified_ts = r.modified_ts
        row.modified_by = r.modified_by
        row.field_name = field_name
        row.new_value = value
        row._merge_order = 1
        output.append(row)
class Fake_ES():
    # A LOCAL, FILE-BACKED STAND-IN FOR AN ElasticSearch INDEX, USED TO KEEP
    # TESTS LOCAL.  RECORDS ARE HELD IN self.data (id -> document) AND
    # PERSISTED TO A JSON FILE ON EVERY extend().

    def __init__(self, settings):
        # ONLY settings.filename IS USED; host/index ARE FAKED
        self.settings = wrap({"host":"fake", "index":"fake"})
        self.filename = settings.filename
        try:
            self.data = CNV.JSON2object(File(self.filename).read())
        except IOError:
            # NO CACHE FILE YET; START EMPTY
            self.data = Struct()

    def search(self, query):
        # EMULATE A FILTERED QUERY: ONLY query.query.filtered.filter IS HONOURED
        query = wrap(query)
        f = CNV.esfilter2where(query.query.filtered.filter)
        filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
        if query.fields:
            # RETURN ONLY THE REQUESTED FIELDS, MIMICKING ES "fields" RESPONSES
            return wrap({"hits": {"total": len(filtered), "hits": [{"_id": d._id, "fields": unwrap(Q.select([unwrap(d._source)], query.fields)[0])} for d in filtered]}})
        else:
            return wrap({"hits": {"total": len(filtered), "hits": filtered}})

    def extend(self, records):
        """
        JUST SO WE MODEL A Queue
        """
        # EACH RECORD IS {"id": ..., "value": ...}; MERGE INTO THE STORE AND
        # REWRITE THE WHOLE BACKING FILE
        records = {v["id"]: v["value"] for v in records}
        struct.unwrap(self.data).update(records)

        data_as_json = CNV.object2JSON(self.data, pretty=True)

        File(self.filename).write(data_as_json)
        Log.note("{{num}} items added", {"num": len(records)})

    def add(self, record):
        # SINGLE-RECORD CONVENIENCE WRAPPER AROUND extend()
        if isinstance(record, list):
            Log.error("no longer accepting lists, use extend()")
        return self.extend([record])

    def delete_record(self, filter):
        # DROP EVERY STORED DOCUMENT MATCHING filter (in-memory only; the
        # backing file is not rewritten until the next extend())
        f = CNV.esfilter2where(filter)
        self.data = wrap({k: v for k, v in self.data.items() if not f(v)})

    def set_refresh_interval(self, seconds):
        # NO-OP: REAL ES NEEDS THIS, THE FAKE DOES NOT
        pass
def main(settings, es=None, es_comments=None):
    # MAIN ETL LOOP: PICK INCREMENTAL OR FULL ETL BASED ON THE LAST-RUN
    # TIMESTAMP FILE, THEN SWING ALIASES AND RECORD THE RUN TIME.
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # A first_run_time FILE WITHOUT A last_run_time FILE MEANS A PREVIOUS FULL
    # ETL WAS INTERRUPTED; RESUME IT
    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    # A PREVIOUS RUN COMPLETED; ONLY PICK UP CHANGES SINCE THEN
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        # SWING ALIASES TO THE FRESH INDEXES, DROPPING OLD ONES
        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # PERSIST THE RUN TIME SO THE NEXT INVOCATION IS INCREMENTAL
        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            candidates = [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
            some_bugs = [b for b in candidates if b not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            # SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                # COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
                else:
                    pass
            except Exception as e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def flatten_attachments(data):
    """Turn attachment records into flat field-change rows (one row per non-bug_id field)."""
    rows = []
    for r in data:
        for k, v in r.items():
            if k == "bug_id":
                continue
            rows.append(Struct(
                bug_id=r.bug_id,
                modified_ts=r.modified_ts,
                modified_by=r.modified_by,
                field_name=k,
                new_value=v,
                # THESE NAMES HAVE DOTS IN THEM
                attach_id=r.attach_id,
                _merge_order=7
            ))
    return rows
def random_sample_of_bugs(settings):
    """
    ETL RANDOM NON-PRIVATE BUG IDS AND COMPARE AGAINST THE REFERENCE ES,
    LOOPING UNTIL A DISCREPANCY IS FOUND.
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(settings.bugzilla) as db:
        candidate = Fake_ES(settings.fake_es)
        reference = ElasticSearch(settings.reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            # SETUP RUN PARAMETERS
            param = Struct()
            param.BUGS_TABLE_COLUMNS = get_bugs_table_columns(db, settings.bugzilla.schema)
            param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join(["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS]))
            param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name")
            param.END_TIME = CNV.datetime2milli(datetime.utcnow())
            param.START_TIME = 0
            param.alias_file = settings.param.alias_file
            param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(some_bugs)})

            try:
                etl(db, candidate, param)

                # COMPARE ALL BUGS
                found_errors = compare_both(candidate, reference, settings, some_bugs)
                if found_errors:
                    D.println("Errors found")
                    break
                else:
                    pass
            except Exception as e:
                # FIX: corrected "faiure" -> "failure" in the log message
                D.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def aggregator(data):
    """
    FLATTEN CC LISTS OVER TIME BY BUG
    MULTISET COUNTS THE NUMBER OF EMAIL AT BUG CREATION
    NEGATIVE MEANS THERE WAS AN ADD WITHOUT A REMOVE (AND NOT IN CURRENT LIST)
    """
    for d in data:
        added = Q.map2set(split_email(d.new_value), alias)
        removed = Q.map2set(split_email(d.old_value), alias)

        # MAKE SURE EVERY EMAIL SEEN HAS AN ALIAS RECORD
        for e in added | removed:
            aliases[e] = aliases.get(e, Struct())

        # WALK THE CHANGE BACKWARD: SUBTRACT ADDS, RESTORE REMOVES
        tally = bugs.get(d.bug_id, Multiset(allow_negative=True))
        bugs[d.bug_id] = tally - added + removed
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO
    KEEP THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
    """
    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.elasticsearch)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = self.settings.param.bugs
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)
def test_incremental_etl_catches_tracking_flags(self):
    """An incremental run starting after the flag changes must still pick them up."""
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        start = CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S")
        param.start_time = CNV.datetime2milli(start)
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([813650])
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, 813650)

        flags = [
            "cf_status_firefox18",
            "cf_status_firefox19",
            "cf_status_firefox_esr17",
            "cf_status_b2g18"
        ]
        for v in versions:
            if v.modified_ts > param.start_time:
                for f in flags:
                    if v[f] != "fixed":
                        Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})
def test_specific_bugs(self):
    """
    USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO
    KEEP THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
    """
    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.elasticsearch)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = self.settings.param.bugs
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(candidate, size=1000) as output:
            etl(db, output, param, please_stop=None)
def diff(db, table, old_record, new_record):
    """
    UPDATE bugs_activity WITH THE CHANGES IN RECORDS
    """
    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))

    # FIELDS PRESENT IN ONLY ONE RECORD, PLUS FIELDS WHOSE VALUES DIFFER
    changed = set(old_record.keys()) ^ set(new_record.keys())
    changed |= set([k for k, v in old_record.items() if v != new_record[k]])

    # fielddefs NAMES ARE PREFIXED WITH THE TABLE NAME, EXCEPT FOR "bugs"
    if table != u"bugs":
        prefix = table + u"."
    else:
        prefix = u""

    for c in changed:
        fieldid = db.query("SELECT id FROM fielddefs WHERE name={{field_name}}", {"field_name": prefix + c})[0].id

        if fieldid == None:
            Log.error("Expecting a valid field name")

        # who=1 — NOTE(review): presumably a fixed test user id; confirm against fixture data
        activity = Struct(
            bug_id=old_record.bug_id,
            who=1,
            bug_when=now,
            fieldid=fieldid,
            removed=old_record[c],
            added=new_record[c],
            attach_id=old_record.attach_id,
            comment_id=old_record.comment_id
        )
        db.insert("bugs_activity", activity)

    # BUMP delta_ts SO INCREMENTAL ETL SEES THE BUG AS CHANGED
    db.execute(
        "UPDATE bugs SET delta_ts={{now}} WHERE {{where}}",
        {
            "now": now,
            "where": esfilter2sqlwhere(db, {"term": {"bug_id": old_record.bug_id}})
        }
    )
def main(settings, es=None, es_comments=None):
    # MAIN ETL LOOP: PICK INCREMENTAL OR FULL ETL BASED ON THE LAST-RUN
    # TIMESTAMP FILE, THEN SWING ALIASES AND RECORD THE RUN TIME.
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # A first_run_time FILE WITHOUT A last_run_time FILE MEANS A PREVIOUS FULL
    # ETL WAS INTERRUPTED; RESUME IT
    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    # A PREVIOUS RUN COMPLETED; ONLY PICK UP CHANGES SINCE THEN
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        # SWING ALIASES TO THE FRESH INDEXES, DROPPING OLD ONES
        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # PERSIST THE RUN TIME SO THE NEXT INVOCATION IS INCREMENTAL
        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception, e:
        Log.error("Problem with main ETL loop", e)
def test_incremental_has_correct_expires_on(self):
    # RUN A FULL ETL UP TO 2013, THEN AN INCREMENTAL ETL FROM THERE; EACH BUG
    # MUST END WITH EXACTLY ONE UNEXPIRED (ACTIVE) VERSION RECORD.
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental=CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        #SETUP FIRST RUN PARAMETERS
        param = Struct()
        param.end_time = start_incremental
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        #SETUP INCREMENTAL RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(datetime.utcnow())
        param.start_time = start_incremental
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

    for b in bugs:
        # COUNT THE VERSIONS OF b THAT ARE STILL ACTIVE (expires_on IN THE FUTURE)
        results = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and":[
                    {"term":{"bug_id":b}},
                    {"range":{"expires_on":{"gte":CNV.datetime2milli(datetime.utcnow())}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        if results.hits.total>1:
            Log.error("Expecting only one active bug_version record")
def test_incremental_has_correct_expires_on(self):
    # RUN A FULL ETL UP TO 2013, THEN AN INCREMENTAL ETL FROM THERE; EACH BUG
    # MUST END WITH EXACTLY ONE UNEXPIRED (ACTIVE) VERSION RECORD.
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        #SETUP FIRST RUN PARAMETERS
        param = Struct()
        param.end_time = start_incremental
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        #SETUP INCREMENTAL RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(datetime.utcnow())
        param.start_time = start_incremental
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

    for b in bugs:
        # COUNT THE VERSIONS OF b THAT ARE STILL ACTIVE (expires_on IN THE FUTURE)
        results = es.search({
            "query": {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": {
                        "and": [{
                            "term": {
                                "bug_id": b
                            }
                        }, {
                            "range": {
                                "expires_on": {
                                    "gte": CNV.datetime2milli(datetime.utcnow())
                                }
                            }
                        }]
                    }
                }
            },
            "from": 0,
            "size": 200000,
            "sort": [],
            "fields": ["bug_id"]
        })

        if results.hits.total > 1:
            Log.error("Expecting only one active bug_version record")