def filter(self, where):
    if len(self.edges) == 1 and self.edges[0].domain.type == "index":
        # USE THE STANDARD LIST FILTER
        from pyLibrary.queries import jx
        return jx.filter(self.data.values()[0].cube, where)
    else:
        # FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
        Log.unexpected("Incomplete")
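# USAGE NOTE (SKETCH ONLY): ELSEWHERE IN THESE FUNCTIONS jx.filter TAKES A LIST PLUS EITHER A
# JSON-EXPRESSION where CLAUSE, E.G. {"gt": {"count": 1}} IN agg() BELOW, OR A CALLABLE
# lambda row, rownum, rows: ... AS IN process_batch(). THE DATA HERE IS ILLUSTRATIVE ONLY.
from pyLibrary.queries import jx

rows = [{"count": 0}, {"count": 3}, {"count": 7}]
often = jx.filter(rows, {"gt": {"count": 1}})  # EXPECTED TO KEEP THE ROWS WITH count 3 AND 7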
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = convert.json2value(suite_json)
                suite = convert.value2json(suite.name)
                line = line.replace(suite_json, suite)

    if rownum == 0:
        value = convert.json2value(line)
        if len(line) > 100000:
            value.result.subtests = [s for s in value.result.subtests if s.ok is False]
            value.result.missing_subtests = True

        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > 100000:
        value = convert.json2value(line)
        value.result.subtests = [s for s in value.result.subtests if s.ok is False]
        value.result.missing_subtests = True
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find("\"resource_usage\":") != -1:
        value = convert.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
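# SAMPLING ARITHMETIC IN fix() ABOVE, AS A STANDALONE SKETCH (ASSUMES Random.int(n) RETURNS A
# UNIFORM INTEGER IN [0, n), WHICH IS NOT CONFIRMED HERE): WITH sample_size LEFT AS None,
# coalesce(sample_size, 0.01) IS 0.01, SO THE TEST BECOMES Random.int(100) != 0, TRUE ~99% OF THE TIME.
sample_size = None
denominator = int(1.0 / (sample_size if sample_size is not None else 0.01))  # == 100
# P(Random.int(denominator) != 0) == (denominator - 1) / denominator == 0.99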
def update(self, command):
    try:
        command = wrap(command)
        eq = command.where.eq
        if eq.es_index:
            columns = self.find(eq.es_index, eq.name)
            columns = [c for c in columns if all(get_attr(c, k) == v for k, v in eq.items())]
        else:
            columns = list(self)
            columns = jx.filter(columns, command.where)

        for col in columns:
            for k in command["clear"]:
                col[k] = None
            for k, v in command.set.items():
                col[k] = v
    except Exception as e:
        Log.error("should not happen", cause=e)
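# SHAPE OF THE command ACCEPTED BY update() ABOVE, INFERRED FROM ITS ATTRIBUTE ACCESSES
# (command.where.eq, command["clear"], command.set); THE FIELD VALUES HERE ARE ILLUSTRATIVE ONLY:
example_command = {
    "where": {"eq": {"es_index": "unittest", "name": "build.type"}},
    "clear": ["cardinality"],
    "set": {"last_updated": "2016-01-01"}
}
# columns.update(example_command)  # columns IS A HYPOTHETICAL INSTANCE OF THIS CLASS;
#                                  # SETS/CLEARS THOSE PROPERTIES ON EVERY MATCHING COLUMN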
def agg(today, destination, debug_filter=None, please_stop=None):
    """
    :param today: The day we are performing the calculation for
    :param destination: The ES index where we put the results
    :param debug_filter: Some extra limitation to go faster, and focus, for testing
    :param please_stop: Signal for stopping early
    :return: nothing
    """
    # GET LIST OF ALL TESTS, BY PLATFORM, TYPE, SUITE
    for suite in SUITES:
        domain = {"and": [
            {"prefix": {"run.suite": suite}},
            {"gt": {"build.date": (today - 3 * DAY).unix}},
            {"lt": {"build.date": (today + 4 * DAY).unix}},
            {"exists": "build.platform"},
            {"not": {"in": {"build.platform": EXCLUDE_PLATFORMS}}},
            {"not": {"in": {"build.branch": EXCLUDE_BRANCHES}}}
        ]}
        if debug_filter:
            domain['and'].append(debug_filter)

        _ = convert.value2json("\"\"")

        # WE CAN NOT PULL ALL TESTS, THERE ARE TOO MANY, SO DO ONE SUITE AT A TIME
        Log.note("Get summary of failures in {{suite}} for date {{date}}", suite=suite, date=today)
        suite_summary = http.post_json(config.source.url, json={
            "from": "unittest",
            "groupby": [
                {"name": "test", "value": "result.test"}
            ],
            "where": {"and": [
                domain,
                {"eq": {"result.ok": False}}
            ]},
            "format": "list",
            "limit": 100000
        })

        often_fail = jx.filter(suite_summary.data, {"gt": {"count": 1}})

        for g, tests in jx.groupby(often_fail, size=100):
            tests = wrap(tests)
            if please_stop:
                return

            Log.note("Collect stats on {{num}} tests", num=len(tests))
            tests_summary = http.post_json(config.source.url, json={
                "from": "unittest",
                "groupby": [
                    "run.suite",
                    {"name": "test", "value": "result.test"},
                    "build.platform",
                    "build.product",
                    "build.type",
                    "run.type"
                ],
                "select": [
                    {
                        "name": "date_fails",
                        "value": {
                            "mult": [
                                {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                                {"when": "result.ok", "then": 0, "else": 1}
                            ]
                        },
                        "aggregate": "stats"
                    },
                    {
                        "name": "date",
                        "value": {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                        "aggregate": "stats"
                    },
                    {
                        "name": "fails",
                        "value": {"when": "result.ok", "then": 0, "else": 1},
                        "aggregate": "stats"
                    }
                ],
                "where": {"and": [
                    domain,
                    {"in": {"result.test": tests}}
                ]},
                "format": "list",
                "limit": 100000
            })

            # FOR EACH TEST, CALCULATE THE "RECENTLY BAD" STATISTIC (linear regression slope)
            # THIS IS ONLY A ROUGH CALC FOR TESTING THE UI
            for t in tests_summary.data:
                try:
                    t._id = "-".join([
                        coalesce(t.build.product, ""),
                        t.build.platform,
                        coalesce(t.build.type, ""),
                        coalesce(t.run.type, ""),
                        t.run.suite,
                        t.test,
                        unicode(today.unix)
                    ])
                except Exception as e:
                    Log.error("text join problem", cause=e)
                t.timestamp = today
                t.average = t.fails.avg
                if t.date.var == 0:
                    t.slope = 0
                else:
                    t.slope = (t.date_fails.avg - t.date.avg * t.fails.avg) / t.date.var
                t.etl.timestamp = Date.now()

            # PUSH STATS TO ES
            docs = [{"id": t._id, "value": t} for t in tests_summary.data if t.fails.sum > 0]
            Log.note("Adding {{num}} test summaries", num=len(docs))
            destination.extend(docs)
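# WHY t.slope IN agg() WORKS: THE "stats" AGGREGATES GIVE avg(x*y) (date_fails.avg), avg(x)
# (date.avg), avg(y) (fails.avg) AND var(x) (date.var), AND THE ORDINARY LEAST-SQUARES SLOPE IS
# cov(x, y) / var(x) == (avg(x*y) - avg(x) * avg(y)) / var(x), EXACTLY THE EXPRESSION USED ABOVE.
# A MINIMAL SELF-CONTAINED CHECK IN PLAIN PYTHON (NOT PART OF THE PIPELINE; NAMES ARE ILLUSTRATIVE):
def _slope_from_stats(xs, ys):
    n = float(len(xs))
    avg_x = sum(xs) / n
    avg_y = sum(ys) / n
    avg_xy = sum(x * y for x, y in zip(xs, ys)) / n
    var_x = sum((x - avg_x) ** 2 for x in xs) / n  # POPULATION VARIANCE IN THIS SKETCH
    if var_x == 0:
        return 0
    return (avg_xy - avg_x * avg_y) / var_x

# EXAMPLE: DAYS -3..3 RELATIVE TO today, FAILURES ONLY ON THE LAST TWO DAYS => POSITIVE SLOPE ("recently bad")
# _slope_from_stats([-3, -2, -1, 0, 1, 2, 3], [0, 0, 0, 0, 0, 1, 1]) ≈ 0.179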
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True
                Log.note(
                    "removing dups {{details|json}}\n{{dups|json|indent}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names) - 1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:
                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
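# A ROUGH INTUITION FOR THE score FIELD SET IN process_batch(), AS A STANDALONE SKETCH (NOT PART
# OF THE PIPELINE): min_line_siblings IS THE SMALLEST NUMBER OF OTHER TESTS SHARING ANY LINE THIS
# TEST COVERS, SO min_siblings == 0 MEANS THE TEST COVERS AT LEAST ONE LINE NO OTHER TEST TOUCHES.
# THE SCORE IS HIGHEST FOR TESTS WITH UNIQUE COVERAGE AND FALLS TOWARD ZERO AS COVERAGE BECOMES REDUNDANT.
def _coverage_score(max_siblings, min_siblings):
    return (max_siblings - min_siblings) / float(max_siblings + min_siblings + 1)

# _coverage_score(9, 0) == 0.9   # SOME LINE IS COVERED ONLY BY THIS TEST
# _coverage_score(9, 9) == 0.0   # EVERY COVERED LINE IS ALSO COVERED BY ALL 9 OTHER TESTS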
def _get_spot_prices_from_aws(self):
    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            cache = FlatList()

    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():
            # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return prices
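# THE stream() GENERATOR ABOVE EMITS A VALID JSON ARRAY PIECE BY PIECE ("[\n", THEN ",\n"-SEPARATED
# ROWS, THEN "]") SO THE WHOLE PRICE LIST NEVER NEEDS TO BE JOINED IN MEMORY. A MINIMAL STANDALONE
# VERSION OF THE SAME PATTERN USING ONLY THE STANDARD LIBRARY (ILLUSTRATION ONLY; LIKE THE ORIGINAL,
# IT ASSUMES AT LEAST ONE RECORD):
import json

def stream_json_array(records):
    prefix = "[\n"
    for r in records:
        yield prefix
        yield json.dumps(r)
        prefix = ",\n"
    yield "]"

# with open("prices.json", "w") as f:
#     for chunk in stream_json_array([{"price": 0.013}, {"price": 0.017}]):
#         f.write(chunk)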
def fix(rownum, line, source, sample_only_filter, sample_size):
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                suite = convert.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if rownum == 0:
        value = convert.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        value = convert.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = convert.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False