def __init__(self, settings):
    # Fake ES instance: advertises placeholder host/index settings and keeps
    # all data in a local JSON file instead of a real cluster.
    self.settings = wrap({"host": "fake", "index": "fake"})
    self.filename = settings.filename
    try:
        stored = File(self.filename).read()
        self.data = CNV.JSON2object(stored)
    except IOError:
        # No backing file yet -- start with an empty store.
        self.data = Struct()
def main(settings):
    """
    Load the alias file and report the duplicate-account mappings it contains:
    every (lost -> canonical) pair is logged, then the inverse mapping is
    dumped, highlighting canonical identities with more than 3 aliases.

    Fix: local variables `file` and `sorted` shadowed Python builtins; renamed
    to `alias_file` and `sorted_data` (no behavior change).
    """
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())
    for v in aliases.values():
        v.candidates = CNV.dict2Multiset(v.candidates)

    # NOTE: `!= None` (not `is not None`) is deliberate -- struct Nulls
    # compare equal to None but are not the None singleton.
    data = [
        {"lost": n, "found": d.canonical}
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical
    ]
    sorted_data = Q.sort(data, "found")
    for s in sorted_data:
        Log.note("{{found}} == {{lost}}", s)

    clean = {
        n: d.canonical
        for n, d in aliases.items()
        if d.canonical != None and n != d.canonical and n != ""
    }
    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    # Canonical identities with many aliases are the interesting ones.
    for k, v in rev_clean.items():
        if len(v) > 3:
            Log.note(CNV.object2JSON({k: v}, pretty=True))
def rename_attachments(bug_version):
    """
    Rewrite dotted attachment keys ("attachments.*") to underscore form
    ("attachments_*") when dotted keys are not in use. Returns the (possibly
    mutated) bug_version either way.
    """
    # `== None` is deliberate: struct Nulls compare equal to None.
    if bug_version.attachments == None:
        return bug_version
    if USE_ATTACHMENTS_DOT:
        # Dotted form is the active convention -- nothing to rename.
        return bug_version
    serialized = CNV.object2JSON(bug_version.attachments)
    bug_version.attachments = CNV.JSON2object(
        serialized.replace("attachments.", "attachments_")
    )
    return bug_version
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO

    Repeatedly ETLs random samples of (non-private) bugs into a scratch ES
    instance and diffs them against a trusted reference index, stopping only
    when a sample produces errors.
    """
    NUM_TO_TEST = 100      # bugs per sampling round
    MAX_BUG_ID = 900000    # upper bound fed to Random.int() -- assumed exclusive, TODO confirm

    with DB(self.settings.bugzilla) as db:
        # candidate = scratch test index; reference = trusted copy to diff against
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        #GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        # NOTE(review): loops forever until a round finds errors -- intentional
        # for bug hunting, but there is no other stop condition.
        while True:
            # Private bugs are excluded: they cannot be compared against the reference.
            some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            #SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                #COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
                else:
                    pass
            except Exception, e:
                # A hard failure in ETL/compare: log the sample and try another round.
                Log.warning(
                    "Total failure during compare of bugs {{bugs}}",
                    {"bugs": some_bugs},
                    e)
def old2new(bug, max_date):
    """
    CONVERT THE OLD ES FORMAT TO THE NEW
    THESE ARE KNOWN CHANGES THAT SHOULD BE MADE TO THE PRODUCTION VERSION

    NOTE(review): mutates/rebinds `bug` via serialize-replace-deserialize
    round trips; no `return` is visible in this chunk -- verify the caller's
    expectation (the function may be truncated here).
    """
    # Normalize everconfirmed: "" -> None, otherwise coerce to int.
    if bug.everconfirmed != None:
        if bug.everconfirmed == "":
            bug.everconfirmed = None
        else:
            bug.everconfirmed = int(bug.everconfirmed)

    # Strip a known trailing space in this product/component string.
    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

    # Cap expiry at the sentinel max time used by the history parser.
    if bug.expires_on > max_date:
        bug.expires_on = parse_bug_history.MAX_TIME
    if bug.votes != None:
        bug.votes = int(bug.votes)
    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
    # Zero votes are dropped entirely rather than stored.
    if bug.votes == 0:
        del bug["votes"]
    # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
    #     bug.remaining_time = 0
    # Non-numeric cf_due_date strings are parsed as %Y-%m-%d dates.
    if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
        bug.cf_due_date = CNV.datetime2milli(
            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
        )
    # Sort changes and rename old field names to the new schema via JSON text.
    bug.changes = CNV.JSON2object(
        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
            .replace("\"field_value_removed\":", "\"old_value\":") \
            .replace("\"field_value\":", "\"new_value\":")
    )
    if bug.everconfirmed == 0:
        del bug["everconfirmed"]
    # Known data-point correction for one specific bug snapshot.
    if bug.id == "692436_1336314345":
        bug.votes = 3

    # cf_last_resolved: numeric -> long millis; string -> parsed datetime.
    # Failures are deliberately ignored (best-effort conversion).
    try:
        if Math.is_number(bug.cf_last_resolved):
            bug.cf_last_resolved = long(bug.cf_last_resolved)
        else:
            bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
    except Exception, e:
        pass
def random_sample_of_bugs(settings): NUM_TO_TEST = 100 MAX_BUG_ID = 900000 with DB(settings.bugzilla) as db: candidate = Fake_ES(settings.fake_es) reference = ElasticSearch(settings.reference) #GO FASTER BY STORING LOCAL FILE local_cache = File(settings.param.temp_dir + "/private_bugs.json") if local_cache.exists: private_bugs = set(CNV.JSON2object(local_cache.read())) else: with Timer("get private bugs"): private_bugs = compare_es.get_private_bugs(reference) local_cache.write(CNV.object2JSON(private_bugs)) while True: some_bugs = [ b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs ] #SETUP RUN PARAMETERS param = Struct() param.BUGS_TABLE_COLUMNS = get_bugs_table_columns( db, settings.bugzilla.schema) param.BUGS_TABLE_COLUMNS_SQL = SQL(",\n".join( ["`" + c.column_name + "`" for c in param.BUGS_TABLE_COLUMNS])) param.BUGS_TABLE_COLUMNS = Q.select(param.BUGS_TABLE_COLUMNS, "column_name") param.END_TIME = CNV.datetime2milli(datetime.utcnow()) param.START_TIME = 0 param.alias_file = settings.param.alias_file param.BUG_IDS_PARTITION = SQL("bug_id in {{bugs}}", {"bugs": db.quote(some_bugs)}) try: etl(db, candidate, param) #COMPARE ALL BUGS found_errors = compare_both(candidate, reference, settings, some_bugs) if found_errors: D.println("Errors found") break else: pass except Exception, e: D.warning("Total faiure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def main(settings):
    """
    Replicate bug data into an ES index, either from a local JSON file
    (one-shot bulk load behind a fresh proto index) or incrementally from a
    source ES index, resuming from the last recorded replication time.
    """
    #USE A FILE
    if settings.source.filename != None:
        # Bulk-load path: build a brand-new proto index, load it with refresh
        # disabled, then atomically swing the alias to it.
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))
        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)  # disable refresh during bulk load
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)
        # Drop superseded indexes, keeping only the one just loaded.
        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source=ElasticSearch(settings.source)
    destination=get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    # Resume point = the EARLIER of the file-recorded time and the
    # destination's own last-updated, falling back to epoch 0.
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    # Captured BEFORE replication so the next run re-covers anything written
    # while this run was in flight.
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
def open_test_instance(name, settings):
    """
    Open an ES instance for testing: a file-backed Fake_ES when
    `settings.filename` is set, otherwise a freshly (re)created index on a
    real cluster.

    Fix: the first log template contained a garbled placeholder
    ("{(unknown)}"); restored to "{{filename}}" to match the parameter dict.
    """
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        # Start from a clean slate: drop any existing index first.
        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
def extract_from_file(source_settings, destination): with File(source_settings.filename) as handle: for g, d in Q.groupby(handle, size=BATCH_SIZE): try: d2 = map( lambda (x): {"id": x.id, "value": x}, map( lambda(x): transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))), d ) ) destination.add(d2) except Exception, e: filename = "Error_" + unicode(g) + ".txt" File(filename).write(d) Log.warning("Can not convert block {{block}} (file={{host}})", { "block": g, "filename": filename }, e)
def loadAliases(settings):
    """
    Load the alias file (JSON) into the module-level `aliases` dict, wrapping
    each value as a struct. A missing/unreadable file degrades to an empty
    alias set.

    NOTE(review): the outer `try:` has no `except` visible in this chunk --
    its handler (logging "Can not init aliases") appears to live elsewhere in
    the file; confirm the function is intact. The "{(unknown)}" tokens in the
    log templates below look like garbled "{{filename}}" placeholders --
    TODO confirm against the original source before relying on these messages.
    """
    try:
        try:
            with Timer(
                    "load alias file at {(unknown)}", {
                        "filename": nvl(settings.param.alias_file.path, settings.param.alias_file)
                    }):
                alias_json = File(settings.param.alias_file).read()
        except Exception, e:
            # Missing alias file is expected on first run: warn, start empty.
            Log.warning(
                "No alias file found (looking at {(unknown)}", {
                    "filename": nvl(settings.param.alias_file.path, settings.param.alias_file)
                })
            alias_json = "{}"
        #self.aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
def __init__(self, settings):
    # File-backed store: reload previous state when the file exists.
    self.filename = settings.filename
    try:
        stored = File(self.filename).read()
        self.data = CNV.JSON2object(stored)
    except IOError:
        # First run -- nothing persisted yet.
        self.data = {}
settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = temp.last() es_comments = ElasticSearch(settings.es_comments) except Exception, e: Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema = CNV.JSON2object(schema, paths=True) schema.settings = jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema = CNV.JSON2object(comment_schema, paths=True) comment_schema.settings = jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name(
def normalize(bug, old_school=False):
    """
    Canonicalize a bug snapshot so diffs between ETL runs are stable: sort
    nested structures, normalize attachment key spelling, scrub via ES rules,
    coerce numeric fields, and convert date fields to epoch-milliseconds.

    NOTE(review): no `return` is visible at the end of this chunk -- the
    definition may be truncated by the file split; verify against the caller.
    """
    bug = bug.copy()
    # id = "<bug_id>_<modified_ts minus the millisecond digits>"
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    #ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(
                CNV.object2JSON(bug.attachments).replace(
                    "attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            for k, v in list(a.items()):
                # Boolean-ish attachment flags: coerce to int and (unless
                # old_school) also expose under the short key (prefix
                # "attachments." stripped -- k[12:]).
                if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
                    new_v = CNV.value2int(v)
                    new_k = k[12:]
                    a[k.replace(".", "\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace(
                "attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    #bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    # Numeric coercion: multi-valued fields -> int lists; zero values are
    # dropped; everything else -> number.
    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None:
            continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                # Already epoch-millis (12-13 digit long) -- leave as-is.
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert to "2012/01/01 00:00:00.000"
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                # FIXME(review): "%H:%M%:S%f" contains the invalid directive
                # "%:S" -- almost certainly meant "%H:%M:%S%f"; this branch
                # will raise and fall into the error handler as written.
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M%:S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Convert "2012/01/01 00:00:00.000" to "2012-01-01"
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(
                    CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception, e:
            Log.error(
                "problem with converting date to milli (value={{value}})",
                {"value": bug[dateField]}, e)
Log.error("Can not init aliases", e) def saveAliases(settings): compressed = { email: details for email, details in aliases.iteritems() if details.canonical } #COMPARE WITH PREVIOUS ALIAS VERSION try: old_alias_json = File(settings.param.alias_file).read() except Exception, e: old_alias_json = "{}" old_aliases = {} for k, v in CNV.JSON2object(old_alias_json).iteritems(): old_aliases[k] = struct.wrap(v) added = set(compressed.keys()) - set(old_aliases.keys()) removed = set(old_aliases.keys()) - set(compressed.keys()) common = set(compressed.keys()) & set(old_aliases.keys()) changed = set() for c in common: if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON( old_aliases[c], pretty=True): changed.add(c) if added or removed or changed: alias_json = CNV.object2JSON(compressed, pretty=True) file = File(settings.param.alias_file)