def search(self, query):
    query = wrap(query)
    f = CNV.esfilter2where(query.query.filtered.filter)
    filtered = wrap([
        {"_id": i, "_source": d}
        for i, d in self.data.items()
        if f(d)
    ])

    if query.fields:
        return wrap({"hits": {"total": len(filtered), "hits": [
            {
                "_id": d._id,
                "fields": unwrap(Q.select([unwrap(d._source)], query.fields)[0])
            }
            for d in filtered
        ]}})
    else:
        return wrap({"hits": {"total": len(filtered), "hits": filtered}})
def search(self, query):
    filter = parse_filter(wrap(query).query.filtered.filter)
    return wrap({"hits": {"hits": [
        {"_id": i, "_source": d}
        for i, d in self.data.items()
        if filter(d)
    ]}})
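# HYPOTHETICAL USAGE SKETCH, NOT IN THE ORIGINAL SOURCE: BOTH search() VARIANTS
# ABOVE ACCEPT THE SAME filtered-QUERY SHAPE; THE FIRST ADDS hits.total AND A
# "fields" PROJECTION WHEN query.fields IS SET.  `fake` IS AN ASSUMED INSTANCE
# OF THE FAKE-ES CLASS THESE METHODS BELONG TO, WITH .data MAPPING _id -> DOCUMENT.
result = fake.search({"query": {"filtered": {
    "query": {"match_all": {}},
    "filter": {"term": {"bug_id": 813650}}
}}})
for hit in result.hits.hits:
    Log.note("found bug {{bug_id}}", {"bug_id": hit._source.bug_id})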
def test_ambiguous_whiteboard_screened(self):
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        # MARK BUG AS ONE OF THE *NOT* SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, "not screened")
        db.flush()

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")
def __init__(self, settings):
    self.settings = wrap({"host": "fake", "index": "fake"})
    self.filename = settings.filename
    try:
        self.data = CNV.JSON2object(File(self.filename).read())
    except IOError:
        self.data = Struct()
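# HYPOTHETICAL CONSTRUCTION SKETCH, NOT IN THE ORIGINAL SOURCE: THE CONSTRUCTOR
# ABOVE PRIMES THE FAKE INDEX FROM A JSON SNAPSHOT AT settings.filename, FALLING
# BACK TO AN EMPTY Struct WHEN THE FILE IS MISSING.  Fake_ES IS AN ASSUMED NAME
# FOR THE CLASS THESE METHODS BELONG TO; THE FILENAME IS ILLUSTRATIVE ONLY.
fake = Fake_ES(struct.wrap({"filename": "test_results/fake_index.json"}))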
def test_whiteboard_screened(self):
    GOOD_BUG_TO_TEST = 1046

    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # MARK BUG AS ONE OF THE SCREENED GROUPS
        database.add_bug_group(db, GOOD_BUG_TO_TEST, SCREENED_WHITEBOARD_BUG_GROUPS[0])
        db.flush()

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, 0)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
        param.allow_private_bugs = True

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, GOOD_BUG_TO_TEST)

        for v in versions:
            if v.status_whiteboard not in (None, "", "[screened]"):
                Log.error("Expecting whiteboard to be screened")
def test_incremental_etl_catches_tracking_flags(self):
    database.make_test_instance(self.settings.bugzilla)

    with DB(self.settings.bugzilla) as db:
        es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

        # SETUP RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(get_current_time(db))
        # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
        param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = struct.wrap([813650])
        param.allow_private_bugs = self.settings.param.allow_private_bugs

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
        versions = get_all_bug_versions(es, 813650)

        flags = [
            "cf_status_firefox18",
            "cf_status_firefox19",
            "cf_status_firefox_esr17",
            "cf_status_b2g18"
        ]
        for v in versions:
            if v.modified_ts > param.start_time:
                for f in flags:
                    if v[f] != "fixed":
                        Log.error("813650 should have {{flag}}=='fixed'", {"flag": f})
def loadAliases(settings):
    try:
        try:
            with Timer("load alias file at {{filename}}", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)}):
                alias_json = File(settings.param.alias_file).read()
        except Exception as e:
            Log.warning("No alias file found (looking at {{filename}})", {"filename": nvl(settings.param.alias_file.path, settings.param.alias_file)})
            alias_json = "{}"

        # aliases IS A MODULE-LEVEL dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception as e:
        Log.error("Can not load aliases", e)
def test_incremental_has_correct_expires_on(self):
    # 813650, 726635 BOTH HAVE CHANGES IN 2013
    bugs = struct.wrap([813650, 726635])
    start_incremental = CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))

    es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
    with DB(self.settings.bugzilla) as db:
        # SETUP FIRST RUN PARAMETERS
        param = Struct()
        param.end_time = start_incremental
        param.start_time = 0
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        # SETUP INCREMENTAL RUN PARAMETERS
        param = Struct()
        param.end_time = CNV.datetime2milli(datetime.utcnow())
        param.start_time = start_incremental
        param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
        param.alias_file = self.settings.param.alias_file
        param.bug_list = bugs
        param.allow_private_bugs = False

        with ThreadedQueue(es, size=1000) as output:
            etl(db, output, param, please_stop=None)

        for b in bugs:
            results = es.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"term": {"bug_id": b}},
                        {"range": {"expires_on": {"gte": CNV.datetime2milli(datetime.utcnow())}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": [],
                "fields": ["bug_id"]
            })

            if results.hits.total > 1:
                Log.error("Expecting only one active bug_version record")
def get(es, esfilter, fields=None, limit=None):
    query = struct.wrap({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": esfilter
        }},
        "from": 0,
        "size": nvl(limit, 200000),
        "sort": []
    })

    if fields:
        query.fields = fields
        results = es.search(query)
        return Q.select(results.hits.hits, "fields")
    else:
        results = es.search(query)
        return Q.select(results.hits.hits, "_source")
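# HYPOTHETICAL USAGE SKETCH, NOT IN THE ORIGINAL SOURCE: get() ABOVE WRAPS A
# BARE esfilter IN THE STANDARD filtered QUERY; WITH fields SET IT RETURNS THE
# "fields" PROJECTION, OTHERWISE FULL "_source" DOCUMENTS.  THE FILTER SHAPE
# MIRRORS THE expires_on TEST ABOVE; THE FIELD NAMES ARE ILLUSTRATIVE.
current_versions = get(
    es,
    {"and": [
        {"term": {"bug_id": 813650}},
        {"range": {"expires_on": {"gte": CNV.datetime2milli(datetime.utcnow())}}}
    ]},
    fields=["bug_id", "modified_ts"],
    limit=10
)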
def etl(db, output_queue, param, please_stop):
    """
    PROCESS RANGE, AS SPECIFIED IN param AND PUSH
    BUG VERSION RECORDS TO output_queue
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with db_cache_lock:
        if not db_cache:
            with Timer("open connections to db"):
                for f in get_stuff_from_bugzilla:
                    db = DB(db)  # OPEN ANOTHER CONNECTION WITH SAME SETTINGS, ONE PER EXTRACT FUNCTION
                    db_cache.append(db)

    db_results = Queue(max=2**30)
    with db_cache_lock:
        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
        with AllThread() as all:
            for i, f in enumerate(get_stuff_from_bugzilla):
                def process(target, db, param, please_stop):
                    db_results.extend(target(db, param))

                all.add(process, f, db_cache[i], param.copy())
    db_results.add(Thread.STOP)

    sorted = Q.sort(db_results, [
        "bug_id",
        "_merge_order",
        {"field": "modified_ts", "sort": -1},
        "modified_by"
    ])

    process = BugHistoryParser(param, output_queue)
    for s in sorted:
        process.processRow(s)
    process.processRow(struct.wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
def delete_record(self, filter):
    f = CNV.esfilter2where(filter)
    self.data = wrap({k: v for k, v in self.data.items() if not f(v)})
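# HYPOTHETICAL USAGE SKETCH, NOT IN THE ORIGINAL SOURCE: delete_record() ABOVE
# COMPILES THE esfilter TO A PREDICATE AND KEEPS ONLY NON-MATCHING DOCUMENTS,
# SO A term FILTER REMOVES EVERY STORED VERSION OF ONE BUG FROM THE FAKE INDEX.
fake.delete_record({"term": {"bug_id": 813650}})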
def saveAliases(settings):
    compressed = {
        email: details
        for email, details in aliases.iteritems()
        if details.canonical
    }

    # COMPARE WITH PREVIOUS ALIAS VERSION
    try:
        old_alias_json = File(settings.param.alias_file).read()
    except Exception as e:
        old_alias_json = "{}"

    old_aliases = {}
    for k, v in CNV.JSON2object(old_alias_json).iteritems():
        old_aliases[k] = struct.wrap(v)

    added = set(compressed.keys()) - set(old_aliases.keys())
    removed = set(old_aliases.keys()) - set(compressed.keys())
    common = set(compressed.keys()) & set(old_aliases.keys())

    changed = set()
    for c in common:
        if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON(old_aliases[c], pretty=True):
            changed.add(c)

    if added or removed or changed:
        alias_json = CNV.object2JSON(compressed, pretty=True)
        file = File(settings.param.alias_file)
        file.write(alias_json)
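# HYPOTHETICAL ROUND-TRIP SKETCH, NOT IN THE ORIGINAL SOURCE: loadAliases()
# FILLS THE MODULE-LEVEL aliases DICT FROM DISK AND saveAliases() WRITES IT BACK
# ONLY WHEN ENTRIES WERE ADDED, REMOVED, OR CHANGED.  THE EMAIL AND canonical
# VALUE BELOW ARE ILLUSTRATIVE ONLY.
settings = struct.wrap({"param": {"alias_file": "aliases.json"}})
loadAliases(settings)
aliases["alice@example.com"] = struct.wrap({"canonical": "alice@mozilla.org"})
saveAliases(settings)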