def write_lines(self, key, *lines):
    self._verify_key_format(key)
    storage = self.bucket.new_key(key + ".json.gz")

    buff = BytesIO()
    archive = gzip.GzipFile(fileobj=buff, mode='w')
    count = 0
    for l in lines:
        if hasattr(l, "__iter__"):
            for ll in l:
                archive.write(ll.encode("utf8"))
                archive.write(b"\n")
                count += 1
        else:
            archive.write(l.encode("utf8"))
            archive.write(b"\n")
            count += 1
    archive.close()
    file_length = buff.tell()
    buff.seek(0)

    with Timer(
        "Sending {{count}} lines in {{file_length|comma}} bytes",
        {"file_length": file_length, "count": count},
        debug=self.settings.debug
    ):
        storage.set_contents_from_file(buff)

    if self.settings.public:
        storage.set_acl('public-read')
    return
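# ALL SNIPPETS IN THIS SECTION SHARE THE SAME Timer PATTERN: A MOUSTACHE-STYLE
# TEMPLATE, AN OPTIONAL params DICT, AND AN OPTIONAL debug FLAG. A MINIMAL
# SKETCH (ASSUMED IMPORT PATH; THE WORKLOAD IS HYPOTHETICAL):
from pyLibrary.times.timer import Timer


def do_work():
    return sum(range(1000))  # HYPOTHETICAL WORKLOAD


with Timer("processed {{num}} items", {"num": 1000}, debug=True) as timer:
    do_work()

# AFTER THE BLOCK EXITS, THE ELAPSED TIME IS AVAILABLE AS timer.duration,
# WHICH IS HOW THE SNIPPETS BELOW FILL IN result.meta.timing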
def get_raw_json(path):
    active_data_timer = Timer("total duration")
    body = flask.request.get_data()
    try:
        with active_data_timer:
            args = wrap(Dict(**flask.request.args))
            limit = args.limit if args.limit else 10
            args.limit = None

            frum = wrap_from(path)
            result = jx.run({
                "from": path,
                "where": {"eq": args},
                "limit": limit,
                "format": "list"
            }, frum)

            if isinstance(result, Container):  # TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                result = result.format("list")

        result.meta.active_data_response_time = active_data_timer.duration

        response_data = convert.unicode2utf8(convert.value2json(result.data, pretty=True))
        Log.note("Response is {{num}} bytes", num=len(response_data))
        return Response(response_data, status=200)
    except Exception, e:
        e = Except.wrap(e)
        return _send_error(active_data_timer, body, e)
def get_treeherder_job(self):
    try:
        with Timer("Process Request"):
            args = Dict(**flask.request.args)

            # IS THE branch/revision PENDING?
            result = self.get_markup(
                unwraplist(args.branch),
                unwraplist(args.revision),
                unwraplist(args.task_id),
                unwraplist(args.buildername),
                unwraplist(args.timestamp)
            )

            response_data = convert.unicode2utf8(convert.value2json(result))
            return Response(
                response_data,
                status=200,
                headers={
                    "access-control-allow-origin": "*",
                    "content-type": "text/plain"
                }
            )
    except Exception, e:
        e = Except.wrap(e)
        Log.warning("Could not process", cause=e)
        e = e.as_dict()
        return Response(
            convert.unicode2utf8(convert.value2json(e)),
            status=400,
            headers={
                "access-control-allow-origin": "*",
                "content-type": "application/json"
            }
        )
def _worker(self, please_stop):
    if Sqlite.canonical:
        self.db = Sqlite.canonical
    else:
        self.db = sqlite3.connect(':memory:')

    try:
        while not please_stop:
            if DEBUG:
                Log.note("begin pop")
            command, result, signal, trace = self.queue.pop()
            if DEBUG:
                Log.note("done pop")
            if DEBUG:
                Log.note("Running command\n{{command|indent}}", command=command)
            with Timer("Run command", debug=DEBUG):
                if signal is not None:
                    try:
                        curr = self.db.execute(command)
                        result.meta.format = "table"
                        result.data = curr.fetchall()
                    except Exception, e:
                        e = Except.wrap(e)
                        result.exception = Except(
                            ERROR,
                            "Problem with\n{{command|indent}}",
                            command=command,
                            cause=e
                        )
                    finally:
                        signal.go()
def test_multiple_agg_on_same_field(self):
    if self.not_real_service():
        return

    test = wrap({"query": {
        "from": {
            "type": "elasticsearch",
            "settings": {
                "host": ES_CLUSTER_LOCATION,
                "index": "unittest",
                "type": "test_result"
            }
        },
        "select": [
            {"name": "max_bytes", "value": "run.stats.bytes", "aggregate": "max"},
            {"name": "count", "value": "run.stats.bytes", "aggregate": "count"}
        ]
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def verify_blobber_file(line_number, name, url):
    """
    :param line_number: for debugging
    :param name: for debugging
    :param url: TO BE READ
    :return: RETURNS BYTES **NOT** UNICODE
    """
    if name in ["emulator-5554.log", "qemu.log"] or any(map(name.endswith, [".png", ".html"])):
        return None, 0

    with Timer("Read {{name}}: {{url}}", {"name": name, "url": url}, debug=DEBUG):
        response = http.get(url)
        try:
            logs = response.all_lines
        except Exception, e:
            if name.endswith("_raw.log"):
                Log.error(
                    "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                    line=line_number,
                    name=name,
                    url=url,
                    cause=e
                )
            if DEBUG:
                Log.note(
                    "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                    line=line_number,
                    name=name,
                    url=url
                )
            return None, 0
def extend(self, values):
    records = []
    for v in wrap(values):
        row = {"_id": v.id}
        for k, vv in v.value.leaves():
            row[k] = vv
        records.append(row)

    with Timer("Push {{num}} records to Redshift", {"num": len(records)}):
        self.db.insert_list(self.settings.table, records)
def test_timing(self):
    if self.not_real_service():
        return

    test = wrap({"query": {
        "from": {
            "type": "elasticsearch",
            "settings": {
                "host": ES_CLUSTER_LOCATION,
                "index": "unittest",
                "type": "test_result"
            }
        },
        "select": [
            {"name": "count", "value": "run.duration", "aggregate": "count"},
            {"name": "total", "value": "run.duration", "aggregate": "sum"}
        ],
        "edges": [
            {"name": "chunk", "value": ["run.suite", "run.chunk"]},
            "result.ok"
        ],
        "where": {"and": [
            {"lt": {"timestamp": Date.floor(Date.now()).milli / 1000}},
            {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
        ]},
        "format": "cube",
        "samples": {"limit": 30}
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def test_simple_query(self):
    if self.not_real_service():
        return

    query = convert.unicode2utf8(convert.value2json({"from": "unittest"}))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def find_largest_key(self):
    """
    FIND LARGEST VERSION NUMBER (with dots (.) and colons (:)) IN
    THE KEYS OF AN S3 BUCKET.
    """
    with Timer("Full scan of {{bucket}} for max key", {"bucket": self.bucket.name}):
        maxi = 0
        for k in self.bucket.bucket.list(delimiter=":"):
            try:
                v = key_prefix(k.name)
                maxi = max(maxi, v)
            except Exception, e:
                self.bucket.bucket.delete_key(k.name)
        return maxi
def list_s3(settings, filter):
    """
    LIST THE KEYS AND TIMESTAMPS FOUND IN AN S3 BUCKET
    """
    with Timer("get all metadata"):
        metas = Bucket(settings).metas()

    filtered = qb.run({
        "from": metas,
        "where": filter,
        "sort": "last_modified"
    })

    for meta in filtered:
        Log.note("Read {{key}} {{timestamp}}", key=meta.key, timestamp=meta.last_modified)
def process_unittest(source_key, etl_header, buildbot_summary, unittest_log, destination, please_stop=None):
    timer = Timer("Process log {{file}} for {{key}}", {
        "file": etl_header.name,
        "key": source_key
    })
    try:
        with timer:
            summary = accumulate_logs(source_key, etl_header.name, unittest_log, please_stop)
    except Exception, e:
        Log.error("Problem processing {{key}}", key=source_key, cause=e)
        summary = None
def test_longest_running_tests(self):
    test = wrap({"query": {
        "sort": {"sort": -1, "field": "avg"},
        "from": {
            "from": "unittest",
            "where": {"and": [{"gt": {"build.date": "1439337600"}}]},
            "groupby": ["build.platform", "build.type", "run.suite", "result.test"],
            "select": [{"aggregate": "avg", "name": "avg", "value": "result.duration"}],
            "format": "table",
            "limit": 100
        },
        "limit": 100,
        "format": "list"
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    for key in keys:
        timer = Timer("key")
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue

                    row, please_stop = fix(rownum, line, source, sample_only_filter, sample_size)
                    num_keys += 1

                    if queue == None:
                        queue = self._get_queue(row)
                    queue.add(row)

                    if please_stop:
                        break
        except Exception, e:
            done_copy = None
            Log.warning(
                "Could not process {{key}} after {{duration|round(places=2)}}seconds",
                key=key,
                duration=timer.duration.seconds,
                cause=e
            )
def copy2es(es, settings, work_queue, please_stop=None):
    # EVERYTHING FROM ELASTICSEARCH
    bucket = s3.Bucket(settings.source)

    for key in iter(work_queue.pop, ""):
        if please_stop:
            return
        if key == None:
            continue

        key = unicode(key)

        extend_time = Timer("insert", silent=True)
        Log.note("Indexing {{key}}", key=key)
        with extend_time:
            if settings.sample_only:
                sample_filter = {"terms": {"build.branch": settings.sample_only}}
            elif settings.sample_size:
                sample_filter = True
            else:
                sample_filter = None

            if key.find(":") >= 0:
                more_keys = bucket.keys(prefix=key)
            else:
                more_keys = bucket.keys(prefix=key + ":")
            num_keys = es.copy(more_keys, bucket, sample_filter, settings.sample_size)

        if num_keys > 1:
            Log.note(
                "Added {{num}} keys from {{key}} block in {{duration}} ({{rate|round(places=3)}} keys/second)",
                num=num_keys,
                key=key,
                duration=extend_time.duration,
                rate=num_keys / Math.max(extend_time.duration.seconds, 0.01)
            )

        work_queue.commit()
def test_branch_count(self):
    if self.not_real_service():
        return

    test = wrap({"query": {
        "from": {
            "type": "elasticsearch",
            "settings": {
                "host": ES_CLUSTER_LOCATION,
                "index": "unittest",
                "type": "test_result"
            }
        },
        "select": [
            {"aggregate": "count"},
        ],
        "edges": ["build.branch"],
        "where": {"or": [
            {"missing": "build.id"}
            # {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
        ]},
        "format": "table"
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def test_51586(self):
    debug_settings = {
        "trace": True,
        "cprofile": {
            "enabled": True,
            "filename": "tests/results/test_51586_profile.tab"
        }
    }
    Log.start(debug_settings)

    source_key = "51586_5124145.52"
    content = File("tests/resources/51586_5124145.52.json.gz").read_bytes()
    source = Dict(read_lines=lambda: GzipLines(content))

    with Accumulator(File("tests/results/51586_5124145.52.json")) as destination:
        with Timer("ETL file"):
            process_unittest_in_s3(source_key, source, destination, please_stop=None)

    Log.stop()
def extract_rows(es, es_query, source, select, query):
    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    for i, s in enumerate(select.copy()):
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            try:
                column_names = set(
                    c.name
                    for c in query.frum.get_columns()
                    if (c.type not in ["object"] or c.useSource) and not c.depth
                )
            except Exception, e:
                Log.warning("can not get columns", e)
                column_names = UNION(*[[k for k, v in row.items()] for row in T.select(source)])
            column_names -= set(select.name)

            select = select[:i:] + [{"name": n, "value": n} for n in column_names] + select[i + 1::]
            break
def write_lines(self, key, lines):
    self._verify_key_format(key)
    storage = self.bucket.new_key(key + ".json.gz")

    buff = TemporaryFile()
    archive = gzip.GzipFile(fileobj=buff, mode='w')
    count = 0
    for l in lines:
        if hasattr(l, "__iter__"):
            for ll in l:
                archive.write(ll.encode("utf8"))
                archive.write(b"\n")
                count += 1
        else:
            archive.write(l.encode("utf8"))
            archive.write(b"\n")
            count += 1

    archive.close()
    file_length = buff.tell()

    retry = 3
    while retry:
        try:
            with Timer(
                "Sending {{count}} lines in {{file_length|comma}} bytes",
                {"file_length": file_length, "count": count},
                debug=self.settings.debug
            ):
                buff.seek(0)
                storage.set_contents_from_file(buff)
            break
        except Exception, e:
            Log.warning("could not push data to s3", cause=e)
            retry -= 1
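# NOTE: UNLIKE THE BytesIO VARIANT OF write_lines ABOVE, THIS VERSION SPOOLS
# THE GZIP ARCHIVE TO A TemporaryFile AND RE-SEEKS TO ITS START ON EVERY PASS
# OF THE RETRY LOOP, SO A FAILED UPLOAD CAN BE RETRIED (UP TO 3 TIMES) WITHOUT
# REBUILDING THE ARCHIVE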
def test_failures_by_directory(self):
    if self.not_real_service():
        return

    test = wrap({"query": {
        "from": {
            "type": "elasticsearch",
            "settings": {
                "host": ES_CLUSTER_LOCATION,
                "index": "unittest",
                "type": "test_result"
            }
        },
        "select": [{"aggregate": "count"}],
        "edges": ["result.test", "result.ok"],
        "where": {"prefix": {"result.test": "/"}},
        "format": "table"
    }})

    query = convert.unicode2utf8(convert.value2json(test.query))
    # EXECUTE QUERY
    with Timer("query"):
        response = http.get(self.service_url, data=query)
        if response.status_code != 200:
            error(response)
    result = convert.json2value(convert.utf82unicode(response.all_content))

    Log.note("result\n{{result|indent}}", {"result": result})
def pull_repo(repo):
    if not File(os.path.join(repo.directory, ".hg")).exists:
        File(repo.directory).delete()

        # REPO DOES NOT EXIST, CLONE IT
        with Timer("Clone hg log for {{name}}", {"name": repo.name}):
            proc = subprocess.Popen(
                ["hg", "clone", repo.url, File(repo.directory).filename],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=-1
            )
            try:
                while True:
                    line = proc.stdout.readline()
                    if line.startswith("abort:"):
                        Log.error("Can not clone {{repos.url}}, because {{problem}}", {
                            "repos": repo,
                            "problem": line
                        })
                    if line == '':
                        break
                    Log.note("Mercurial cloning: {{status}}", {"status": line})
            finally:
                proc.wait()
    else:
        hgrc_file = File(os.path.join(repo.directory, ".hg", "hgrc"))
        if not hgrc_file.exists:
            hgrc_file.write("[paths]\ndefault = " + repo.url + "\n")

        # REPO EXISTS, PULL TO UPDATE
        with Timer("Pull hg log for {{name}}", {"name": repo.name}):
            proc = subprocess.Popen(
                ["hg", "pull", "--cwd", File(repo.directory).filename],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=-1
            )
            (output, _) = proc.communicate()

            if output.find("abort: repository default not found!") >= 0:
                File(repo.directory).delete()
                pull_repo(repo)
                return
            if output.find("abort: abandoned transaction found") >= 0:
                Log.error("Problem pulling repos, try \"hg recover\"\n{{reason|indent}}", {"reason": output})
                File(repo.directory).delete()
                pull_repo(repo)
                return
            if output.find("abort: ") >= 0:
                Log.error("Problem with pull {{reason}}", {"reason": between(output, "abort:", "\n")})

            Log.note("Mercurial pull results:\n{{pull_results}}", {"pull_results": output})
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    leaf_columns = set(
        c.name
        for c in columns
        if c.type not in ["object", "nested"] and (not c.nested_path or c.es_column == c.nested_path)
    )
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": n,
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    parent = s.value.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": c,
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": n,
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def es_aggsop(es, frum, query):
    select = wrap([s.copy() for s in listwrap(query.select)])
    es_column_map = {c.name: unwraplist(c.es_column) for c in frum.schema.all_columns}

    es_query = Dict()
    new_select = Dict()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            s.pull = "doc_count"
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                if frum.typed:
                    # STATISTICAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        # TODO: HANDLE BOTH $value AND $objects TO COUNT
                        Log.error("do not know how to handle")
                    else:
                        s.value.var = "$value"
                        new_select["$value"] += [s]
                else:
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        # TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                        Log.error("do not know how to handle")
                    else:
                        Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate)
            elif s.aggregate == "count":
                s.value = s.value.map(es_column_map)
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                s.value = s.value.map(es_column_map)
                new_select[literal_field(s.value.var)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value.var == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value.var

        # canonical_name=literal_field(many[0].name)
        for s in many:
            if s.aggregate == "count":
                es_query.aggs[literal_field(canonical_name)].value_count.field = field_name
                s.pull = literal_field(canonical_name) + ".value"
            elif s.aggregate == "median":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = key + ".values.50\.0"
            elif s.aggregate == "percentile":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, basestring) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = key + ".values." + literal_field(unicode(percent))
            elif s.aggregate == "cardinality":
                # ES USES DIFFERENT METHOD FOR CARDINALITY
                key = literal_field(canonical_name + " cardinality")

                es_query.aggs[key].cardinality.field = field_name
                s.pull = key + ".value"
            elif s.aggregate == "stats":
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = field_name

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + " percentile")
                es_query.aggs[median_name].percentiles.field = field_name
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = {
                    "count": stats_name + ".count",
                    "sum": stats_name + ".sum",
                    "min": stats_name + ".min",
                    "max": stats_name + ".max",
                    "avg": stats_name + ".avg",
                    "sos": stats_name + ".sum_of_squares",
                    "std": stats_name + ".std_deviation",
                    "var": stats_name + ".variance",
                    "median": median_name + ".values.50\.0"
                }
            elif s.aggregate == "union":
                # USE TERMS AGGREGATE TO SIMULATE union
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].terms.field = field_name
                es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT)
                s.pull = stats_name + ".buckets.key"
            else:
                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name
                s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)
        abs_value = s.value.map(es_column_map)

        if s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby()
            s.pull = literal_field(canonical_name) + ".value"
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = key + ".values.50\.0"
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = key + ".values." + literal_field(unicode(percent))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = abs_value.to_ruby()
            s.pull = key + ".value"
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby()

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = abs_value.to_ruby()
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = {
                "count": stats_name + ".count",
                "sum": stats_name + ".sum",
                "min": stats_name + ".min",
                "max": stats_name + ".max",
                "avg": stats_name + ".avg",
                "sos": stats_name + ".sum_of_squares",
                "std": stats_name + ".std_deviation",
                "var": stats_name + ".variance",
                "median": median_name + ".values.50\.0"
            }
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby()
            s.pull = stats_name + ".buckets.key"
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
            es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby()

    decoders = get_decoders_by_depth(query)
    start = 0

    vars_ = query.where.vars()

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum, map_=es_column_map)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter())
            es_query = Dict(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": frum.query_path}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

        for d in decoders[0]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[0]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter())
            es_query = Dict(
                aggs={"_filter": set_default({"filter": filter}, es_query)}
            )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
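# FOR ORIENTATION, A SKETCH OF THE AGGREGATION BODY THE "percentile" BRANCH
# ABOVE WOULD EMIT FOR select = {"value": "duration", "aggregate": "percentile",
# "percentile": 0.5} (THE FIELD NAME IS INVENTED FOR ILLUSTRATION):
#
#     {"duration percentile": {
#         "percentiles": {
#             "field": "duration",
#             "percents": [50.0]
#         }
#     }}
#
# s.pull THEN BECOMES "duration percentile.values.50\.0", THE DOTTED PATH USED
# TO READ THE COMPUTED VALUE BACK OUT OF THE ES RESPONSE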
def _parse_properties(self, abs_index, properties, meta):
    abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
    abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
        lambda r: not r.es_column.startswith("other.") and
                  not r.es_column.startswith("previous_values.cf_") and
                  not r.es_index.startswith("debug")
    )
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path:
                c.table = c.es_index + "." + query_path.last()
            else:
                c.table = c.es_index

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    if query_path:
                        c.table = alias + "." + query_path.last()
                    else:
                        c.table = alias
                    self._upsert_column(c)

        # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
        query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"])
        for a, b in itertools.product(query_paths, query_paths):
            aa = a.last()
            bb = b.last()
            if aa and bb.startswith(aa):
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) < len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(0, aa)
                    break
        query_paths.append([])

        for c in abs_columns:
            # ADD RELATIVE COLUMNS
            full_path = listwrap(c.nested_path)
            abs_depth = len(full_path)
            abs_parent = coalesce(full_path.last(), "")
            for query_path in query_paths:
                rel_depth = len(query_path)

                # ABSOLUTE
                add_column(copy(c), query_path)
                cc = copy(c)
                cc.relative = True

                if not query_path:
                    add_column(cc, query_path)
                    continue

                rel_parent = query_path.last()

                if c.es_column.startswith(rel_parent + "."):
                    cc.name = c.es_column[len(rel_parent) + 1:]
                    add_column(cc, query_path)
                elif c.es_column == rel_parent:
                    cc.name = "."
                    add_column(cc, query_path)
                elif not abs_parent:
                    # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                    # AND THEN REMOVE THE SHADOWED
                    cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                    add_column(cc, query_path)
                elif rel_parent.startswith(abs_parent + "."):
                    cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                    add_column(cc, query_path)
                elif rel_parent != abs_parent:
                    # SIBLING NESTED PATHS ARE INVISIBLE
                    pass
                else:
                    Log.error("logic error")
def _get_job_results_from_th(self, branch, revision):
    output = []

    with self.locker:
        waiting_threads = self.pending.get((branch, revision))
        if waiting_threads is None:
            sig = None
            waiting_threads = self.pending[(branch, revision)] = [output]
        else:
            sig = Signal()
            waiting_threads.append(sig)

    if sig is not None:
        Log.note("Holding thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
        sig.wait_for_go()
        return waiting_threads[0]

    try:
        results = DictList()
        while True:
            response = self._rate_limited_get_json(expand_template(RESULT_SET_URL, {
                "branch": branch,
                "revision": revision[0:12:]
            }))
            results.extend(response.results)
            if len(response.results) != 1000:
                break

        for g, repo_ids in jx.groupby(results.id, size=10):
            jobs = DictList()
            with Timer("Get {{num}} jobs", {"num": len(repo_ids)}, debug=DEBUG):
                while True:
                    response = self._rate_limited_get_json(expand_template(JOBS_URL, {
                        "branch": branch,
                        "offset": len(jobs),
                        "result_set_id": ",".join(map(unicode, repo_ids))
                    }))
                    jobs.extend(response.results)
                    if len(response.results) != 2000:
                        break

            with Timer("Get (up to {{num}}) details from TH", {"num": len(jobs)}, debug=DEBUG):
                details = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    details.extend(self._rate_limited_get_json(
                        url=expand_template(DETAILS_URL, {
                            "branch": branch,
                            "job_id": ",".join(map(unicode, ids))
                        }),
                        retry={"times": 3}
                    ).results)
                details = {k.job_guid: list(v) for k, v in jx.groupby(details, "job_guid")}

            with Timer("Get (up to {{num}}) stars from TH", {"num": len(jobs)}, debug=DEBUG):
                stars = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    response = self._rate_limited_get_json(expand_template(JOB_BUG_MAP, {
                        "branch": branch,
                        "job_id": "&job_id=".join(map(unicode, ids))
                    }))
                    stars.extend(response)
                stars = {k.job_id: list(v) for k, v in jx.groupby(stars, "job_id")}

            with Timer("Get notes from TH", debug=DEBUG):
                notes = []
                for jid in set([j.id for j in jobs if j.failure_classification_id != 1] + stars.keys()):
                    response = self._rate_limited_get_json(expand_template(NOTES_URL, {
                        "branch": branch,
                        "job_id": unicode(jid)
                    }))
                    notes.extend(response)
                notes = {k.job_id: list(v) for k, v in jx.groupby(notes, "job_id")}

            for j in jobs:
                output.append(self._normalize_job_result(branch, revision, j, details, notes, stars))

        if output:
            with Timer("Write to ES cache", debug=DEBUG):
                self.cache.extend(
                    {"id": "-".join([c.repo.branch, unicode(c.job.id)]), "value": c}
                    for c in output
                )
                try:
                    self.cache.flush()
                except Exception, e:
                    Log.warning("problem flushing. nevermind.", cause=e)
    finally:
        with self.locker:
            for p in waiting_threads[1:]:
                if DEBUG:
                    Log.note("releasing thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
                p.go()
            self.pending[(branch, revision)] = None

    return output
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[
            {
                "name": ["--bucket"],
                "help": "bucket to reprocess",
                "type": str,
                "dest": "bucket",
                "required": True
            },
            {
                "name": ["--begin", "--start"],
                "help": "lowest key (or prefix) to reprocess",
                "type": str,
                "dest": "start",
                "default": "1",
                "required": False
            },
            {
                "name": ["--end", "--stop"],
                "help": "highest key (or prefix) to reprocess",
                "type": str,
                "dest": "end",
                "default": None,
                "required": False
            },
            {
                "name": ["--file"],
                "help": "path to file with CR-delimited prefix list",
                "type": str,
                "dest": "file",
                "default": None,
                "required": False
            }
        ])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}", key=k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })
    except Exception, e:
        Log.error("Problem with etl", e)
def process_pulse_block(source_key, source, destination, please_stop=None):
    """
    SIMPLE CONVERT pulse_block INTO S3 LOGFILES
    PREPEND WITH ETL HEADER AND PULSE ENVELOPE
    """
    output = []
    stats = Dict()

    etl_header_gen = EtlHeadGenerator(source_key)
    for i, line in enumerate(source.read_lines()):
        if please_stop:
            Log.error("Stopping early")

        pulse_record = scrub_pulse_record(source_key, i, line, stats)
        if not pulse_record:
            continue

        if DEBUG or DEBUG_SHOW_LINE:
            Log.note(
                "Source {{key}}, line {{line}}, buildid = {{buildid|quote}}",
                key=source_key,
                line=i,
                buildid=pulse_record.payload.builddate
            )

        file_num = 0
        for name, url in pulse_record.payload.blobber_files.items():
            try:
                if url == None:
                    if DEBUG:
                        Log.note("Line {{line}}: found structured log with NULL url", line=i)
                    continue

                log_content, num_lines = verify_blobber_file(i, name, url)
                if not log_content:
                    continue

                with Timer(
                    "Copied {{line}}, {{name}} with {{num_lines}} lines",
                    {"line": i, "name": name, "num_lines": num_lines},
                    debug=DEBUG
                ):
                    dest_key, dest_etl = etl_header_gen.next(pulse_record.payload.etl, name)

                    destination.write_lines(
                        dest_key,
                        convert.value2json(dest_etl),  # ETL HEADER
                        line,  # PULSE MESSAGE
                        log_content
                    )
                    file_num += 1
                    output.append(dest_key)

                    if DEBUG_SHOW_LINE:
                        Log.note("Copied {{key}}: {{url}}", key=dest_key, url=url)
            except Exception, e:
                Log.error("Problem processing {{name}} = {{url}}", name=name, url=url, cause=e)

        if not file_num and DEBUG_SHOW_NO_LOG:
            Log.note("No structured log {{json}}", json=pulse_record.payload)
        if name.endswith("_raw.log"):
            Log.error(
                "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                line=line_number,
                name=name,
                url=url,
                cause=e
            )
        if DEBUG:
            Log.note(
                "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                line=line_number,
                name=name,
                url=url
            )
        return None, 0

    if any(name.endswith(e) for e in STRUCTURED_LOG_ENDINGS):
        # FAST TRACK THE FILES WE SUSPECT TO BE STRUCTURED LOGS ALREADY
        return logs, "unknown"

    # DETECT IF THIS IS A STRUCTURED LOG
    with Timer("Structured log detection {{name}}:", {"name": name}, debug=DEBUG):
        try:
            total = 0  # ENSURE WE HAVE A SIDE EFFECT
            count = 0
            bad = 0
            for blobber_line in logs:
                blobber_line = strings.strip(blobber_line)
                if not blobber_line:
                    continue

                try:
                    total += len(convert.json2value(blobber_line))
                    count += 1
                except Exception, e:
                    if DEBUG:
                        Log.note("Not JSON: {{line}}",
def __init__(self, dim, parent, qb):
    self.name = dim.name
    self.parent = parent
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    dot.set_default(self, dim)
    self.esfilter = dim.esfilter
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.es.settings.name)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Dict()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, qb)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        self.fields = wrap(fields)
        edges = wrap([
            {"name": k, "value": v, "allowNulls": False}
            for k, v in self.fields.items()
        ])
    else:
        self.fields = listwrap(fields)
        edges = wrap([
            {"name": f, "value": f, "index": i, "allowNulls": False}
            for i, f in enumerate(self.fields)
        ])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if dim.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = qb.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "esfilter": self.esfilter,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Dict(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = DictList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "esfilter": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            if isinstance(fields, Mapping):
                output = Dict()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "esfilter": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
class Sqlite(object):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, db=None):
        """
        :param db: Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if not _upgraded:
            _upgrade()

        self.db = None
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')

        try:
            full_path = File("pyLibrary/vendor/sqlite/libsqlitefunctions.so").abspath
            # self.db.execute("SELECT sqlite3_enable_load_extension(1)")
            self.db.enable_load_extension(True)
            self.db.execute("SELECT load_extension('" + full_path + "')")
        except Exception, e:
            Log.warning(
                "loading sqlite extension functions failed, doing without. (no SQRT for you!)",
                cause=e
            )

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")
                if DEBUG:
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e
                            )
                        finally:
                            signal.go()
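# A HYPOTHETICAL USAGE SKETCH OF THE Sqlite WRAPPER ABOVE; THE TABLE AND DATA
# ARE INVENTED. execute() QUEUES FIRE-AND-FORGET COMMANDS ON THE WORKER THREAD,
# WHILE query() BLOCKS ON A Signal UNTIL THE WORKER FILLS IN THE result Dict.
db = Sqlite()
db.execute("CREATE TABLE temp (value REAL)")    # QUEUED; RETURNS IMMEDIATELY
db.execute("INSERT INTO temp VALUES (4), (9)")  # RUNS IN ORDER ON THE WORKER THREAD

result = db.query("SELECT SQRT(value) FROM temp")  # BLOCKS UNTIL COMPLETE
# result.meta.format == "table"
# result.header HOLDS COLUMN NAMES FROM THE CURSOR DESCRIPTION
# result.data == [(2.0,), (3.0,)] -- SQRT COMES FROM THE LOADED EXTENSION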