def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum
    output = DictList()
    for e in wrap(coalesce(query.edges, query.groupby, [])):
        if e.value != None and not isinstance(e.value, NullOp):
            e = e.copy()
            vars_ = e.value.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            e.value = e.value.map({schema[v].name: schema[v].es_column for v in vars_})
        elif e.range:
            e = e.copy()
            min_ = e.range.min
            max_ = e.range.max
            vars_ = min_.vars() | max_.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            map_ = {schema[v].name: schema[v].es_column for v in vars_}
            e.range = {
                "min": min_.map(map_),
                "max": max_.map(map_)
            }
        elif e.domain.dimension:
            vars_ = e.domain.dimension.fields
            e.domain.dimension = e.domain.dimension.copy()
            e.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(e.domain.partitions.where):
            vars_ = set()
            for p in e.domain.partitions:
                vars_ |= p.where.vars()

        try:
            depths = set(len(schema[v].nested_path) - 1 for v in vars_)
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v] == None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr}} spans tables, can not handle", expr=e.value)
            max_depth = Math.MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception, cause:  # DO NOT SHADOW THE EDGE VARIABLE e
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        limit = 0
        output[max_depth].append(AggsDecoder(e, query, limit))
    return output
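
# MINIMAL SKETCH OF THE BUCKET-BY-DEPTH PATTERN USED ABOVE (an assumption-laden toy:
# plain tuples stand in for edges, and a callable stands in for the schema lookup that
# derives depth from len(schema[v].nested_path) - 1; the real code appends AggsDecoder
# instances, not the items themselves).

def bucket_by_depth(items, depth_of):
    output = []
    for item in items:
        depth = depth_of(item)
        while len(output) <= depth:
            output.append([])            # GROW ONE BUCKET PER NESTED DEPTH
        output[depth].append(item)
    return output

if __name__ == "__main__":
    # depth = len(nested_path) - 1, AS IN get_decoders_by_depth()
    edges = [("build.type", 0), ("result.test", 1), ("run.suite", 0)]
    print bucket_by_depth(edges, lambda e: e[1])
    # -> [[('build.type', 0), ('run.suite', 0)], [('result.test', 1)]]
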
def dictwrap(v):
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Dict()
        _set(m, "_dict", v)  # INJECT m.__dict__=v SO THERE IS NO COPY
        return m
    elif type_ is NoneType:
        return None  # So we allow `is None`
    elif type_ is list:
        return DictList(v)
    elif type_ is GeneratorType:
        return (wrap(vv) for vv in v)
    elif isinstance(v, (basestring, int, float, Decimal, datetime, date, Dict, DictList, NullType, NoneType)):
        return v
    else:
        return DictObject(v)
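
# MINIMAL SKETCH OF THE NO-COPY TRICK USED ABOVE (plain Python stand-in, not the Dict
# class itself): assigning the original dict directly to an instance's __dict__ gives
# attribute-style access over the very same object, so no data is copied and writes
# pass through.

class _AttrView(object):
    def __init__(self, d):
        self.__dict__ = d  # SAME OBJECT, NO COPY

if __name__ == "__main__":
    original = {"a": 1}
    view = _AttrView(original)
    assert view.a == 1
    view.b = 2                  # WRITES THROUGH TO THE ORIGINAL dict
    assert original["b"] == 2
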
def _get_job_results_from_th(self, branch, revision):
    output = []

    with self.locker:
        waiting_threads = self.pending.get((branch, revision))
        if waiting_threads is None:
            sig = None
            waiting_threads = self.pending[(branch, revision)] = [output]
        else:
            sig = Signal()
            waiting_threads.append(sig)  # APPEND THE SIGNAL WE WAIT ON, SO THE WORKER CAN RELEASE US

    if sig is not None:
        Log.note("Holding thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
        sig.wait_for_go()
        return waiting_threads[0]

    try:
        results = DictList()
        while True:
            response = self._rate_limited_get_json(expand_template(RESULT_SET_URL, {
                "branch": branch,
                "revision": revision[0:12]
            }))
            results.extend(response.results)
            if len(response.results) != 1000:
                break

        for g, repo_ids in jx.groupby(results.id, size=10):
            jobs = DictList()
            with Timer("Get {{num}} jobs", {"num": len(repo_ids)}, debug=DEBUG):
                while True:
                    response = self._rate_limited_get_json(expand_template(JOBS_URL, {
                        "branch": branch,
                        "offset": len(jobs),
                        "result_set_id": ",".join(map(unicode, repo_ids))
                    }))
                    jobs.extend(response.results)
                    if len(response.results) != 2000:
                        break

            with Timer("Get (up to {{num}}) details from TH", {"num": len(jobs)}, debug=DEBUG):
                details = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    details.extend(self._rate_limited_get_json(
                        url=expand_template(DETAILS_URL, {
                            "branch": branch,
                            "job_id": ",".join(map(unicode, ids))
                        }),
                        retry={"times": 3}
                    ).results)
                details = {k.job_guid: list(v) for k, v in jx.groupby(details, "job_guid")}

            with Timer("Get (up to {{num}}) stars from TH", {"num": len(jobs)}, debug=DEBUG):
                stars = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    response = self._rate_limited_get_json(expand_template(JOB_BUG_MAP, {
                        "branch": branch,
                        "job_id": "&job_id=".join(map(unicode, ids))
                    }))
                    stars.extend(response)
                stars = {k.job_id: list(v) for k, v in jx.groupby(stars, "job_id")}

            with Timer("Get notes from TH", debug=DEBUG):
                notes = []
                for jid in set([j.id for j in jobs if j.failure_classification_id != 1] + stars.keys()):
                    response = self._rate_limited_get_json(expand_template(NOTES_URL, {
                        "branch": branch,
                        "job_id": unicode(jid)
                    }))
                    notes.extend(response)
                notes = {k.job_id: list(v) for k, v in jx.groupby(notes, "job_id")}

            for j in jobs:
                output.append(self._normalize_job_result(branch, revision, j, details, notes, stars))

        if output:
            with Timer("Write to ES cache", debug=DEBUG):
                self.cache.extend(
                    {"id": "-".join([c.repo.branch, unicode(c.job.id)]), "value": c}
                    for c in output
                )
                try:
                    self.cache.flush()
                except Exception, e:
                    Log.warning("problem flushing. nevermind.", cause=e)
    finally:
        with self.locker:
            for p in waiting_threads[1:]:
                if DEBUG:
                    Log.note("releasing thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
                p.go()
            self.pending[(branch, revision)] = None

    return output
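
# MINIMAL SKETCH OF THE REQUEST-COALESCING PATTERN USED ABOVE (an assumption-laden toy:
# standard-library threading.Event/Lock stand in for pyLibrary's Signal/Lock, and `work`
# stands in for the Treeherder fetching).  The first caller for a key does the work;
# later callers for the same key block on an event and then read the shared output list
# the first caller filled.

import threading

class Coalescer(object):
    def __init__(self, work):
        self.work = work                  # work(key) -> LIST OF RESULTS
        self.locker = threading.Lock()
        self.pending = {}

    def get(self, key):
        with self.locker:
            waiting = self.pending.get(key)
            if waiting is None:
                output = []
                waiting = self.pending[key] = [output]
                sig = None
            else:
                sig = threading.Event()
                waiting.append(sig)

        if sig is not None:
            sig.wait()                    # ANOTHER THREAD IS ALREADY FETCHING THIS KEY
            return waiting[0]

        try:
            output.extend(self.work(key))
        finally:
            with self.locker:
                for s in waiting[1:]:
                    s.set()               # RELEASE ALL THE THREADS THAT PIGGY-BACKED
                self.pending[key] = None
        return output
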
def es_deepop(es, query):
    columns = query.frum.get_columns(query.frum.name)
    query_path = query.frum.query_path
    columns = UniqueIndex(keys=["name"], data=sorted(
        columns,
        lambda a, b: cmp(len(listwrap(b.nested_path)), len(listwrap(a.nested_path)))
    ), fail_on_dup=False)
    map_to_es_columns = {c.name: c.es_column for c in columns}
    map_to_local = {
        c.name: "_inner" + c.es_column[len(listwrap(c.nested_path)[0]):] if c.nested_path else "fields." + literal_field(c.es_column)
        for c in columns
    }
    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, query.frum, map_to_es_columns)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        if column.nested_path:
            return "_inner" + column.es_column[len(listwrap(column.nested_path)[0]):]
        else:
            return "fields." + literal_field(column.es_column)

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.relative and c.type not in ["nested", "object"]:
                            if not c.nested_path:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.name,
                                "pull": get_pull(c),
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {"name": literal_field(c.name), "index": i, "child": "."}
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = [c.name for c in columns if c.relative]
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                            n.name = n.put.name = n.name.lstrip(".")
                else:
                    column = s.value.term.var + "."
                    prefix = len(column)
                    for c in columns:
                        if c.name.startswith(column) and c.type not in ["object", "nested"]:
                            pull = get_pull(c)
                            if len(listwrap(c.nested_path)) == 0:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": s.name + "." + c.name[prefix:],
                                "pull": pull,
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {"name": s.name + "." + literal_field(c.name[prefix:]), "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                for c in columns:
                    if c.relative and c.type not in ["nested", "object"]:
                        if not c.nested_path:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": listwrap(c.nested_path)[0],
                            "put": {"name": ".", "index": i, "child": c.es_column}
                        })
                        i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                column = columns[(s.value.var,)]
                parent = column.es_column + "."
                prefix = len(parent)
                net_columns = [
                    c
                    for c in columns
                    if c.es_column.startswith(parent) and c.type not in ["object", "nested"]
                ]
                if not net_columns:
                    pull = get_pull(column)
                    if not column.nested_path:
                        es_query.fields += [column.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": listwrap(column.nested_path)[0],
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERENCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)

                        pull = get_pull(n)
                        if not n.nested_path:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": listwrap(n.nested_path)[0],
                            "put": {"name": s.name, "index": i, "child": n.es_column[prefix:]}
                        })
                i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for n in columns:
                    if n.name == v:
                        if not n.nested_path:
                            es_query.fields += [n.es_column]

            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.to_dict(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Dict(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH HIT IS RETURNED ONCE PER INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", cause=e)
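
# MINIMAL SKETCH OF THE INNER-HIT FLATTENING DONE BY inners() ABOVE (an assumption-laden
# toy: a hand-built dict shaped like an Elasticsearch search response with inner_hits,
# no live ES call and no pyLibrary wrappers).  Each parent hit is yielded once per inner
# hit, with the inner document attached so later formatting can read both levels.

def flatten_inner_hits(response, query_path):
    for hit in response["hits"]["hits"]:
        for inner in hit["inner_hits"][query_path]["hits"]["hits"]:
            row = dict(hit["fields"])
            row["_inner"] = inner["_source"]
            yield row

if __name__ == "__main__":
    fake_response = {"hits": {"hits": [{
        "fields": {"build.platform": "linux64"},
        "inner_hits": {"result": {"hits": {"hits": [
            {"_source": {"test": "a", "ok": True}},
            {"_source": {"test": "b", "ok": False}},
        ]}}}
    }]}}
    for row in flatten_inner_hits(fake_response, "result"):
        print row
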