def get_existing_ids(es, settings, branches):
    """
    FIND WHAT'S IN ES: collect the integer ``datazilla.id`` values already indexed.

    The id space is scanned in fixed-width windows; for each window a terms
    facet on ``datazilla.id`` lists the ids present.

    :param es: index handle; ``es.settings.alias`` and ``es.search()`` are used
    :param settings: ``settings.elasticsearch.debug``, ``settings.production.step``
        and ``settings.production.min`` are read
    :param branches: when truthy, only documents having ``test_build.push_date``
        or ``test_build.no_pushlog`` are counted as existing
    :return: set of int ids; always empty when ``settings.elasticsearch.debug``
        is set and ``settings.production.step < 10``
    """
    if settings.elasticsearch.debug and settings.production.step < 10:
        # SIMPLY RELOAD THIS SMALL NUMBER
        return set()

    if branches:
        # ONLY DOCUMENTS THAT HAVE ONE OF THE TWO PUSHLOG FIELDS COUNT AS EXISTING
        demand_pushlog = {"or": [
            {"not": {"missing": {"field": "test_build.push_date"}}},
            {"not": {"missing": {"field": "test_build.no_pushlog"}}}
        ]}
    else:
        demand_pushlog = {"match_all": {}}

    bad_ids = []      # FACET TERMS THAT COULD NOT BE CONVERTED TO int
    int_ids = set()
    interval_size = 200000

    with ESQuery(es) as esq:
        max_id = esq.query({
            "from": es.settings.alias,
            "select": {"value": "datazilla.id", "aggregate": "max"}
        })

        # NOTE(review): THE END IS PADDED BY ONE WINDOW, PRESUMABLY SO THE LAST
        # INTERVAL STILL COVERS max_id - CONFIRM AGAINST Q.intervals
        for mini, maxi in Q.intervals(settings.production.min, max_id + interval_size, interval_size):
            response = es.search({
                "query": {
                    "filtered": {
                        "query": {"match_all": {}},
                        "filter": {"and": [
                            {"range": {"datazilla.id": {"gte": mini, "lt": maxi}}},
                            demand_pushlog
                        ]}
                    }
                },
                "from": 0,
                "size": 0,  # NO HITS WANTED, ONLY THE FACET
                "sort": [],
                "facets": {
                    # FACET SIZE MATCHES THE WINDOW WIDTH, SO EVERY ID IN THE WINDOW FITS
                    "ids": {"terms": {"field": "datazilla.id", "size": interval_size}}
                }
            })

            for t in response.facets.ids.terms:
                try:
                    int_ids.add(int(t.term))
                except Exception:
                    # BEST EFFORT: REMEMBER THE UNPARSABLE TERM AND KEEP SCANNING
                    bad_ids.append(t.term)

        Log.println("Number of ids in ES: " + str(len(int_ids)))
        Log.println("BAD ids in ES: " + str(bad_ids))
        return int_ids
def __del__(self):
    """
    On finalization, log the contents of ``self.unknown_branches``.

    Never raises: any failure while logging is discarded.
    """
    try:
        Log.println("Branches missing from pushlog:\n{{list}}", {"list": self.unknown_branches})
    except Exception:
        # DELIBERATELY BEST-EFFORT: A FINALIZER HAS NO CALLER TO REPORT TO, AND
        # THE ATTRIBUTE OR THE LOGGER MAY BE UNAVAILABLE WHEN IT RUNS
        pass
if len(line.strip()) == 0: continue col = line.split("\t") id = int(col[0]) if id < MINIMUM_ID: continue json = col[1] if Math.is_number(json): json = col[2] data = CNV.JSON2object(json).json_blob date = CNV.unix2datetime(data.testrun.date) if id % 1000 == 0: Log.println("loading id " + str(id) + " date: " + CNV.datetime2string(date, "%Y-%m-%d %H:%M:%S")) if date < MINIMUM_DATE: continue if id in all: continue all.add(id) arrays_add(id, "[" + data.test_build.branch + "][" + data.testrun.suite + "]", data) output_file.write(str(id) + "\t" + json) except Exception, e: Log.warning("can not process line:\n\t" + line, e) smallest = min(*all) Log.println("First id >= date: {{min}}", {"min": smallest})
try: if content.startswith("Id not found"): Log.note("{{id}} not found {{url}}", {"id": id, "url": url}) if id < max_id: return True else: return False data = CNV.JSON2object(content.decode('utf-8')) content = CNV.object2JSON(data) #ENSURE content HAS NO crlf if data.test_run_id: Log.println("Add {{id}} for revision {{revision}} ({{bytes}} bytes)", { "id": id, "revision": data.json_blob.test_build.revision, "bytes": len(content) }) with Profiler("transform"): result = transformer.transform(id, data) if result: Log.println("{{num}} records to add", { "num": len(result) }) es_sink.extend({"value": d} for d in result) file_sink.add(str(id) + "\t" + content + "\n") elif data.error_flag == 'Y': error = data.json_blob error.datazilla = data