def test_extract_job(complex_job, extract_job_settings):
    """
    If you find this test failing, then copy the JSON in the test failure into the
    test_extract_job.json file, then you may use the diff to review the changes.
    """
    # EXTRACT THE ONE JOB DOCUMENT FROM MySQL USING THE SNOWFLAKE EXTRACTOR
    with MySQL(extract_job_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_job_settings.source) as extractor:
            sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))
            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    # NORMALIZE THE VOLATILE FIELDS SO THE COMPARISON AGAINST THE JOB FIXTURE IS STABLE
    doc = first(acc)
    doc.guid = first(JOB).guid  # NEW EACH TIME
    job_guid = first(jx.drill(JOB, "job_log.failure_line.job_guid"))
    for fl in jx.drill(doc, "job_log.failure_line"):
        fl.job_guid = job_guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )
def __getitem__(self, item):
    """
    Select from this cube.

    :param item: dict mapping edge name -> partition value (slices the cube),
                 or text naming one of the select columns (returns a value cube)
    :return: Null when a requested partition value is not found; a plain data
             mapping when the slice is zero-dimensional; otherwise a smaller Cube
    """
    # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
    # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
    # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
    # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
    if is_data(item):
        coordinates = [None] * len(self.edges)

        # MAP DICT TO NUMERIC INDICES
        for name, v in item.items():
            ei, parts = first((i, e.domain.partitions) for i, e in enumerate(self.edges) if e.name == name)
            if not parts:
                Log.error(
                    "Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                    name=name,
                    value=v)
            part = first(p for p in parts if p.value == v)
            if not part:
                return Null
            else:
                coordinates[ei] = part.dataIndex

        # EDGES NOT MENTIONED IN item REMAIN AS DIMENSIONS OF THE RESULT
        edges = [e for e, v in zip(self.edges, coordinates) if v is None]
        if not edges:
            # ZERO DIMENSIONAL VALUE
            return dict_to_data({
                k: v.__getitem__(coordinates)
                for k, v in self.data.items()
            })
        else:
            output = Cube(
                select=self.select,
                edges=list_to_data([e for e, v in zip(self.edges, coordinates) if v is None]),
                data={
                    k: Matrix(values=c.__getitem__(coordinates))
                    for k, c in self.data.items()
                })
            return output
    elif is_text(item):
        # RETURN A VALUE CUBE
        if self.is_value:
            if item != self.select.name:
                Log.error("{{name}} not found in cube", name=item)
            return self

        if item not in self.select.name:
            Log.error("{{name}} not found in cube", name=item)

        output = Cube(
            select=first(s for s in self.select if s.name == item),
            edges=self.edges,
            data={item: self.data[item]})
        return output
    else:
        Log.error("not implemented yet")
def filter(self, where):
    """
    Filter the cube by `where`; only implemented for the one-dimensional
    "index" domain case, where the standard list filter applies.
    """
    if len(self.edges) == 1 and first(self.edges).domain.type == "index":
        # USE THE STANDARD LIST FILTER
        from jx_python import jx

        return jx.filter(first(self.data.values()).cube, where)
    else:
        # FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
        Log.unexpected("Incomplete")
def get_decoders_by_path(query, schema):
    """
    RETURN MAP FROM QUERY PATH TO LIST OF DECODER ARRAYS

    :param query: the jx query, whose edges/groupby are turned into AggsDecoders
    :param schema: used to resolve variables to columns and their nested paths
    :return: dict {nested path -> [AggsDecoder, ...]}
    """
    output = {}

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in to_data(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        vars_ = coalesce(edge.value.vars(), set())

        if edge.range:
            vars_ |= edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            # DIMENSION IS A SIMPLE LIST OF FIELDS; RESOLVE THEM TO es_columns
            vars_ |= set(Variable(v) for v in edge.domain.dimension.fields)
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [schema[v.var].es_column for v in vars_]
        elif edge.domain.partitions.where and all(edge.domain.partitions.where):
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()
        else:
            # SIMPLE edge.value
            decoder = AggsDecoder(edge, query, limit)
            depths = set(c.nested_path[0] for v in vars_ for c in schema.leaves(v.var))
            output.setdefault(first(depths), []).append(decoder)
            continue

        # VERIFY THE EDGE RESOLVES TO EXACTLY ONE NESTED DEPTH
        depths = set(c.nested_path[0] for v in vars_ for c in schema.leaves(v.var))
        if not depths:
            Log.error(
                "Do not know of column {{column}}",
                column=unwraplist([v for v in vars_ if schema[v.var] == None]))
        if len(depths) > 1:
            Log.error("expression {{expr|quote}} spans tables, can not handle", expr=edge.value)

        decoder = AggsDecoder(edge, query, limit)
        output.setdefault(first(depths), []).append(decoder)
    return output
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in a terms aggregation restricted to this edge's known
    partitions; when allowNulls is set, also add "missing" branches at each
    nested level.
    """
    domain = self.domain
    domain_key = domain.key
    value = Painless[self.edge.value]
    cnv = pull_functions[value.type]
    # ONLY AGGREGATE THE VALUES THAT BELONG TO KNOWN PARTITIONS
    include = tuple(cnv(p[domain_key]) for p in domain.partitions)

    exists = Painless[AndOp([
        InOp([value, Literal(include)])
    ])].partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if is_op(value, Variable):
        es_field = first(self.query.frum.schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
        match = TermsAggs(
            "_match",
            {
                "field": es_field,
                "size": limit,
                "order": {"_term": self.sorted} if self.sorted else None
            },
            self
        )
    else:
        match = TermsAggs(
            "_match",
            {
                "script": text_type(value.to_es_script(self.schema)),
                "size": limit
            },
            self
        )
    output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

    if self.edge.allowNulls:
        # FIND NULLS AT EACH NESTED LEVEL
        for p in self.schema.query_path:
            if p == query_path:
                # MISSING AT THE QUERY DEPTH
                output.add(
                    NestedAggs(p).add(FilterAggs("_missing0", NotOp(exists), self).add(es_query))
                )
            else:
                # PARENT HAS NO CHILDREN, SO MISSING
                column = first(self.schema.values(query_path, (OBJECT, EXISTS)))
                output.add(
                    NestedAggs(column.nested_path[0]).add(
                        FilterAggs(
                            "_missing1",
                            NotOp(ExistsOp(Variable(column.es_column.replace(NESTED_TYPE, EXISTS_TYPE)))),
                            self
                        ).add(es_query)
                    )
                )
    return output
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in a terms aggregation restricted to this edge's known
    partitions; when allowNulls is set, also add "missing" branches at each
    nested level.  (Duplicate of the sibling SetDecoder implementation.)
    """
    domain = self.domain
    domain_key = domain.key
    value = Painless[self.edge.value]
    cnv = pull_functions[value.type]
    # ONLY AGGREGATE THE VALUES THAT BELONG TO KNOWN PARTITIONS
    include = tuple(cnv(p[domain_key]) for p in domain.partitions)
    exists = Painless[AndOp([InOp([value, Literal(include)])])].partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if is_op(value, Variable):
        es_field = first(self.query.frum.schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
        match = TermsAggs(
            "_match",
            {
                "field": es_field,
                "size": limit,
                "order": {"_term": self.sorted} if self.sorted else None
            },
            self)
    else:
        match = TermsAggs(
            "_match",
            {
                "script": text_type(value.to_es_script(self.schema)),
                "size": limit
            },
            self)
    output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

    if self.edge.allowNulls:
        # FIND NULLS AT EACH NESTED LEVEL
        for p in self.schema.query_path:
            if p == query_path:
                # MISSING AT THE QUERY DEPTH
                output.add(
                    NestedAggs(p).add(FilterAggs("_missing0", NotOp(exists), self).add(es_query)))
            else:
                # PARENT HAS NO CHILDREN, SO MISSING
                column = first(self.schema.values(query_path, (OBJECT, EXISTS)))
                output.add(
                    NestedAggs(column.nested_path[0]).add(
                        FilterAggs(
                            "_missing1",
                            NotOp(ExistsOp(Variable(column.es_column.replace(NESTED_TYPE, EXISTS_TYPE)))),
                            self).add(es_query)))
    return output
def output():
    """
    Generator of unique ids: reserves blocks of 1000 ids by advancing
    `next_id` in the ABOUT_TABLE inside a transaction, then yields the
    reserved ids one at a time.
    """
    while True:
        with self.db.transaction() as t:
            top_id = first(first(
                t.query(
                    SQL_SELECT + quote_column("next_id") + SQL_FROM + quote_column(ABOUT_TABLE)
                ).data
            ))
            max_id = top_id + 1000
            t.execute(SQL_UPDATE + quote_column(ABOUT_TABLE) + SQL_SET + sql_eq(next_id=max_id))
        # YIELD OUTSIDE THE TRANSACTION SO THE RESERVATION IS COMMITTED FIRST
        while top_id < max_id:
            yield top_id
            top_id += 1
def is_bulk_agg(esq, query):
    """
    Decide whether this aggregation query should be run as a bulk job.

    :param esq: the ES query container (unused here, kept for interface parity)
    :param query: the jx query to inspect
    :return: True only when S3 is configured, the destination/format are
             bulk-capable, and there is exactly one simple-variable groupby
    """
    # ONLY ACCEPTING ONE DIMENSION AT THIS TIME
    groupby = listwrap(query.groupby)
    eligible = (
        S3_CONFIG
        and query.destination in {"s3", "url"}
        and query.format in {"list", "table"}
        and len(groupby) == 1
    )
    if not eligible:
        return False

    normalized = first(_normalize_group(first(groupby), 0, query.limit))
    # THE SINGLE DIMENSION MUST BE A PLAIN VARIABLE
    return True if is_op(normalized.value, Variable) else False
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in a terms aggregation on this edge's value, plus a
    "missing" branch for documents where the edge value does not exist.
    """
    if is_op(self.edge.value, FirstOp) and is_op(self.edge.value.term, Variable):
        self.edge.value = self.edge.value.term  # ES USES THE FIRST TERM FOR {"terms": } AGGREGATION

    if not is_op(self.edge.value, Variable):
        # NON-TRIVIAL EXPRESSION: AGGREGATE ON A PAINLESS SCRIPT
        terms = TermsAggs(
            "_match",
            {
                "script": {"lang": "painless", "inline": self.script.expr},
                "size": self.domain.limit,
                "order": self.es_order
            },
            self
        )
    else:
        terms = TermsAggs(
            "_match",
            {
                "field": first(self.schema.leaves(self.edge.value.var)).es_column,
                "size": self.domain.limit,
                "order": self.es_order
            },
            self
        )

    output = Aggs()
    output.add(FilterAggs("_filter", self.exists, None).add(terms.add(es_query)))
    output.add(FilterAggs("_missing", self.missing, self).add(es_query))
    return output
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in a terms aggregation whose key is the multivalue field
    joined into a single pipe-delimited string (via the LIST_TO_PIPE script).
    """
    es_field = first(self.query.frum.schema.leaves(self.var)).es_column
    return Aggs().add(TermsAggs("_match", {
        "script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'}),
        "size": self.limit
    }, self).add(es_query))
def aggop_to_es_queries(select, query_path, schema, query):
    """
    Build the ES aggregation tree for an aggregate query.

    :param select: the select clauses to aggregate
    :param query_path: nested path the query runs at
    :param schema: used to render expressions to ES
    :param query: the full jx query (edges/groupby drive the decoders)
    :return: (aggs tree, decoders indexed by edge dimension, es_query data)
    """
    base_agg = extract_aggs(select, query_path, schema)
    base_agg = NestedAggs(query_path).add(base_agg)

    all_paths, split_decoders, var_to_columns = pre_process(query)

    # WE LET EACH DIMENSION ADD ITS OWN CODE FOR HANDLING INNER JOINS
    concat_outer = query_to_outer_joins(query, all_paths, {}, var_to_columns)

    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    output = NestedAggs(".")
    for i, outer in enumerate(concat_outer.terms):
        acc = base_agg
        for p, path in enumerate(all_paths):
            decoder = split_decoders.get(path, Null)
            for d in decoder:
                decoders[d.edge.dim] = d
                acc = d.append_query(path, acc)
                start += d.num_columns

            where = first(nest.where for nest in outer.nests if nest.path == path).partial_eval()
            if where is FALSE:
                # THIS PATH CONTRIBUTES NOTHING TO THIS OUTER-JOIN TERM
                continue
            elif not where or where is TRUE:
                pass
            else:
                acc = FilterAggs("_filter" + text(i) + text(p), where, None).add(acc)
            acc = NestedAggs(path).add(acc)
        output.add(acc)

    output = simplify(output)
    es_query = to_data(output.to_es(schema))
    es_query.size = 0
    return output, decoders, es_query
def _gen_ids():
    """
    Generator of unique ids: reserves blocks of 1000 ids by advancing
    `next_id` in the version table inside a transaction, then yields the
    reserved ids one at a time.
    """
    while True:
        with db.transaction() as t:
            top_id = first(first(
                t.query(
                    sql_query({
                        "select": "next_id",
                        "from": version_table
                    })
                ).data
            ))
            max_id = top_id + 1000
            t.execute(SQL_UPDATE + quote_column(version_table) + SQL_SET + sql_eq(next_id=max_id))
        # YIELD OUTSIDE THE TRANSACTION SO THE RESERVATION IS COMMITTED FIRST
        while top_id < max_id:
            yield top_id
            top_id += 1
def to_esfilter(self, schema):
    """
    Convert this "in" expression to an ES filter clause; falls back to a
    Painless script filter when the left side is not a plain variable.
    """
    if is_op(self.value, Variable_):
        var = self.value.var
        cols = schema.leaves(var)
        if not cols:
            # UNKNOWN COLUMN CAN NEVER MATCH
            return MATCH_NONE
        col = first(cols)
        var = col.es_column

        if col.jx_type == BOOLEAN:
            if is_literal(self.superset) and not is_many(self.superset.value):
                return {"term": {var: value2boolean(self.superset.value)}}
            else:
                # NOTE(review): map() is a lazy iterator in Python 3 — assumes the
                # downstream JSON encoder consumes iterators; confirm
                return {"terms": {var: map(value2boolean, self.superset.value)}}
        else:
            if is_literal(self.superset) and not is_many(self.superset.value):
                return {"term": {var: self.superset.value}}
            else:
                return {"terms": {var: self.superset.value}}
    else:
        return Painless[self].to_es_script(schema).to_esfilter(schema)
def get_or_create_user(self, details):
    """
    Find the user record matching the id_token claims, creating one when it
    does not exist.

    :param details: id_token claims; expects sub (or issuer), email,
                    email_verified
    :return: existing user row (as Data) or the newly inserted record
    """
    details = wrap(details)
    issuer = details.sub or details.issuer
    email = details.email
    email_verified = details.email_verified
    if not email:
        # FIX: error message typo ("propert" -> "property")
        Log.error("Expecting id_token to have claims.email property")

    result = self.db.query(sql_query({
        "select": ["_id", "email", "issuer"],
        "from": GROUP_TABLE,
        "where": {"eq": {"email": email, "issuer": issuer}},
    }))

    if result.data:
        # FOUND: REHYDRATE THE ROW INTO A RECORD, ANNOTATED WITH VERIFICATION STATUS
        user = Data(zip(result.header, first(result.data)))
        user.email_verified = email_verified
        return user

    new_user = wrap({
        "email": email,
        "issuer": issuer,
        "email_verified": email_verified,
        "owner": ROOT_USER._id,
    })
    self._insert(GROUP_TABLE, new_user)
    return new_user
def select(self, select):
    """
    Project this container onto the given select clause(s).

    :param select: one clause or a list of clauses, each with name/value
    :return: self when selecting "." by name; otherwise a new ListContainer

    FIX: `map(...)` returns a one-shot lazy iterator in Python 3; the sibling
    implementation materializes with `list(map(...))` so the container's data
    can be iterated more than once — do the same here.
    """
    selects = listwrap(select)

    if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    if is_list(select):
        if all(is_op(s.value, Variable) and s.name == s.value.var for s in select):
            names = set(s.value.var for s in select)
            new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

        push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(d):
            # APPLY EACH (name, pull-function) PAIR TO BUILD THE OUTPUT ROW
            output = Data()
            for n, p in push_and_pull:
                output[n] = unwraplist(p(wrap(d)))
            return unwrap(output)

        new_data = list(map(selector, self.data))
    else:
        select_value = jx_expression_to_function(select.value)
        new_data = list(map(select_value, self.data))
        if is_op(select.value, Variable):
            column = copy(first(c for c in self.schema.columns if c.name == select.value.var))
            column.name = '.'
            new_schema = Schema("from " + self.name, [column])

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def select(self, select):
    """
    Project this container onto the given select clause(s).

    :param select: one clause or a list of clauses, each with name/value
    :return: self when selecting "." by name; otherwise a new ListContainer
    """
    selects = listwrap(select)

    if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    if is_list(select):
        if all(is_op(s.value, Variable) and s.name == s.value.var for s in select):
            names = set(s.value.var for s in select)
            new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

        push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(d):
            # APPLY EACH (name, pull-function) PAIR TO BUILD THE OUTPUT ROW
            output = Data()
            for n, p in push_and_pull:
                output[n] = unwraplist(p(to_data(d)))
            return unwrap(output)

        # MATERIALIZE SO DATA CAN BE ITERATED MORE THAN ONCE
        new_data = list(map(selector, self.data))
    else:
        select_value = jx_expression_to_function(select.value)
        new_data = list(map(select_value, self.data))
        if is_op(select.value, Variable):
            # REBUILD THE SINGLE COLUMN AS A "." COLUMN FOR THE NEW SCHEMA
            column = dict(**first(c for c in self.schema.columns if c.name == select.value.var))
            column.update({"name": ".", "jx_type": NESTED, "es_type": "nested", "multi": 1001, "cardinality": 1})
            new_schema = Schema("from " + self.name, [Column(**column)])

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def download_perfherder(desc, repo, id, dummy, framework):
    """
    Fetch one Perfherder performance series from Treeherder and log it
    (sorted by push_timestamp, values only).

    :param desc: human-readable name used in the logged output
    :param repo: Treeherder project/repository name
    :param id: signature id to look up
    :param dummy: unused (kept for caller interface)
    :param framework: Perfherder framework id
    """
    sig_result = http.get_json(
        "https://treeherder.mozilla.org/api/project/" + repo
        + "/performance/signatures/?format=json&framework=" + str(framework)
        + "&id=" + str(id)
    )

    # THE RESPONSE IS KEYED BY SIGNATURE HASH; TAKE THE ONLY ONE
    signature = first(sig_result.keys())

    data_result = http.get_json(
        "https://treeherder.mozilla.org/api/project/" + repo
        + "/performance/data/?signatures=" + signature
    )

    Log.note(
        "{{result|json}}",
        result={
            "name": desc,
            "data": jx.run({
                "from": ListContainer("data", data_result[signature]),
                "sort": "push_timestamp",
                "select": "value"
            }).data
        },
    )
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in a terms aggregation on this edge's value, plus a
    "missing" branch for documents where the edge value does not exist.
    (Duplicate of the sibling implementation.)
    """
    if is_op(self.edge.value, FirstOp) and is_op(self.edge.value.term, Variable):
        self.edge.value = self.edge.value.term  # ES USES THE FIRST TERM FOR {"terms": } AGGREGATION

    if not is_op(self.edge.value, Variable):
        # NON-TRIVIAL EXPRESSION: AGGREGATE ON A PAINLESS SCRIPT
        terms = TermsAggs(
            "_match",
            {
                "script": {"lang": "painless", "inline": self.script.expr},
                "size": self.domain.limit,
                "order": self.es_order
            },
            self)
    else:
        terms = TermsAggs(
            "_match",
            {
                "field": first(self.schema.leaves(self.edge.value.var)).es_column,
                "size": self.domain.limit,
                "order": self.es_order
            },
            self)

    output = Aggs()
    output.add(FilterAggs("_filter", self.exists, None).add(terms.add(es_query)))
    output.add(FilterAggs("_missing", self.missing, self).add(es_query))
    return output
def select(self, select):
    """
    Project this container onto the given select clause(s).

    :param select: one clause or a list of clauses, each with name/value
    :return: self when selecting "." by name; otherwise a new ListContainer

    FIX: `map(...)` returns a one-shot lazy iterator in Python 3; the sibling
    implementation materializes with `list(map(...))` so the container's data
    can be iterated more than once — do the same here.
    """
    selects = listwrap(select)

    if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    if is_list(select):
        if all(is_op(s.value, Variable) and s.name == s.value.var for s in select):
            names = set(s.value.var for s in select)
            new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

        push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(d):
            # APPLY EACH (name, pull-function) PAIR TO BUILD THE OUTPUT ROW
            output = Data()
            for n, p in push_and_pull:
                output[n] = unwraplist(p(wrap(d)))
            return unwrap(output)

        new_data = list(map(selector, self.data))
    else:
        select_value = jx_expression_to_function(select.value)
        new_data = list(map(select_value, self.data))
        if is_op(select.value, Variable):
            column = copy(first(c for c in self.schema.columns if c.name == select.value.var))
            column.name = '.'
            new_schema = Schema("from " + self.name, [column])

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def window(self, window):
    """
    Add a window (calculated) column to this cube, in place.

    :param window: clause with name/value; edges and sort are not implemented
    :return: self, with the new column appended to self.select
    :raises NotImplementedError: when window.edges or window.sort is given
    """
    if window.edges or window.sort:
        raise NotImplementedError()

    from jx_python import jx

    # SET OP
    canonical = first(self.data.values())
    accessor = jx.get(window.value)
    cnames = self.data.keys()

    # ANNOTATE EXISTING CUBE WITH NEW COLUMN
    m = self.data[window.name] = Matrix(dims=canonical.dims)
    for coord in canonical._all_combos():
        row = Data()  # IT IS SAD WE MUST HAVE A Data(), THERE ARE {"script": expression} USING THE DOT NOTATION
        for k in cnames:
            row[k] = self.data[k][coord]
        for c, e in zip(coord, self.edges):
            row[e.name] = e.domain.partitions[c]

        m[coord] = accessor(row, Null, Null)  # DUMMY Null VALUES BECAUSE I DO NOT KNOW WHAT TO DO

    self.select.append(window)
    return self
def type(self):
    """
    Return the common type of all branches; mixed branch types degrade
    to OBJECT.
    """
    branch_types = set()
    for w in self.whens:
        # A WhenOp BRANCH CONTRIBUTES THE TYPE OF ITS then CLAUSE
        if is_op(w, WhenOp):
            branch_types.add(w.then.type)
        else:
            branch_types.add(w.type)
    return OBJECT if len(branch_types) > 1 else first(branch_types)
def to_esfilter(self, schema):
    """
    Convert this "in" expression to an ES filter clause; falls back to a
    Painless script filter when the left side is not a plain variable.
    Unlike the sibling variant, an unknown column is a hard error here.
    """
    if is_op(self.value, Variable_):
        var = self.value.var
        cols = schema.leaves(var)
        if not cols:
            Log.error("expecting {{var}} to be a column", var=var)
        col = first(cols)
        var = col.es_column

        if col.jx_type == BOOLEAN:
            if is_literal(self.superset) and not is_sequence(self.superset.value):
                return {"term": {var: value2boolean(self.superset.value)}}
            else:
                # NOTE(review): map() is a lazy iterator in Python 3 — assumes the
                # downstream JSON encoder consumes iterators; confirm
                return {"terms": {var: map(value2boolean, self.superset.value)}}
        else:
            if is_literal(self.superset) and not is_sequence(self.superset.value):
                return {"term": {var: self.superset.value}}
            else:
                return {"terms": {var: self.superset.value}}
    else:
        return Painless[self].to_es_script(schema).to_esfilter(schema)
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Build a ranges aggregation over `edge`'s partitions; when allowNulls is
    set, add a "missing" branch for values outside [min, max) or absent.

    :param to_float: converts partition bounds to the numeric form ES expects
    """
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        output.add(
            FilterAggs(
                "_missing",
                NotOp(
                    AndOp([
                        edge.value.exists(),
                        GteOp([edge.value, Literal(to_float(_min))]),
                        LtOp([edge.value, Literal(to_float(_max))])
                    ]).partial_eval()),
                self).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": text_type(Painless[edge.value].to_es_script(schema))}
    calc['ranges'] = [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
def verify_jwt_token(self, token):
    """
    Verify an Auth0-issued RS256 JWT access token against the tenant's JWKS.

    :param token: the raw JWT string
    :return: decoded claims on success
    :raises: via Log.error with code=403 for expired tokens or bad claims
    """
    jwks = http.get_json("https://" + self.auth0.domain + "/.well-known/jwks.json")
    unverified_header = jwt.get_unverified_header(token)
    algorithm = unverified_header["alg"]
    if algorithm != "RS256":
        Log.error("Expecting a RS256 signed JWT Access Token")

    # SELECT THE JWKS KEY MATCHING THE TOKEN'S kid
    key_id = unverified_header["kid"]
    key = unwrap(first(key for key in jwks["keys"] if key["kid"] == key_id))
    if not key:
        Log.error("could not find {{key}}", key=key_id)

    try:
        # NOTE(review): `algorithms` is passed a single string; confirm the jwt
        # library in use accepts a string here (some expect a list)
        return jwt.decode(
            token,
            key,
            algorithms=algorithm,
            audience=self.auth0.api.identifier,
            issuer="https://" + self.auth0.domain + "/",
        )
    except jwt.ExpiredSignatureError as e:
        Log.error("Token has expired", code=403, cause=e)
    except jwt.JWTClaimsError as e:
        Log.error(
            "Incorrect claims, please check the audience and issuer",
            code=403,
            cause=e,
        )
    except Exception as e:
        Log.error("Problem parsing", cause=e)
def to_es(self, schema):
    """
    Convert this equality expression to an ES term/terms filter; falls back
    to a Painless script when the lhs is not a variable or rhs not a literal.
    """
    if is_op(self.lhs, Variable_) and is_literal(self.rhs):
        lhs = self.lhs.var
        cols = schema.leaves(lhs)
        if cols:
            # RESOLVE TO THE PHYSICAL COLUMN NAME WHEN KNOWN
            lhs = first(cols).es_column
        rhs = self.rhs.value
        if is_many(rhs):
            if len(rhs) == 1:
                # SINGLE-ELEMENT SET IS A PLAIN term FILTER
                return {"term": {lhs: first(rhs)}}
            else:
                return {"terms": {lhs: rhs}}
        else:
            return {"term": {lhs: rhs}}
    else:
        return Painless[self].to_es_script(schema).to_es(schema)
def to_es(self, schema):
    """
    Convert this suffix (ends-with) expression to an ES filter: a regexp
    filter when operating on a variable with a literal suffix, otherwise a
    Painless script.
    """
    if not self.suffix:
        # EMPTY SUFFIX MATCHES EVERYTHING
        return MATCH_ALL
    elif is_op(self.expr, Variable_) and is_literal(self.suffix):
        var = first(schema.leaves(self.expr.var)).es_column
        return {"regexp": {var: ".*" + string2regexp(self.suffix.value)}}
    else:
        return PainlessSuffixOp.to_es_script(self, schema).to_es(schema)
def define(cls, expr):
    """
    Build a PrefixOp from its JSON form: {"prefix": {variable: constant}}
    or {"prefix": [expr, const]}; missing term yields a null PrefixOp.
    """
    term = expr.get('prefix')
    if not term:
        return PrefixOp(NULL, NULL)
    elif is_data(term):
        # SINGLE {variable: constant} PAIR
        expr, const = first(term.items())
        return PrefixOp(Variable(expr), Literal(const))
    else:
        return PrefixOp(*term)
def sql_lt(**item):
    """
    RETURN SQL FOR LESS-THAN (<) COMPARISON BETWEEN VARIABLE AND VALUE

    :param item: a single keyword parameter: column name = value
    :return: SQL
    """
    column_name, value = first(item.items())
    return ConcatSQL(quote_column(column_name), SQL_LT, quote_value(value))
def define(cls, expr):
    """
    Build a PrefixOp from its JSON form: {"prefix": {variable: constant}}
    or {"prefix": [expr, const]} (both sides parsed as jx expressions here);
    missing term yields a null PrefixOp.
    """
    term = expr.get("prefix")
    if not term:
        return PrefixOp(NULL, NULL)
    elif is_data(term):
        # SINGLE {variable: constant} PAIR
        expr, const = first(term.items())
        return PrefixOp(Variable(expr), Literal(const))
    else:
        expr, const = term
        return PrefixOp(jx_expression(expr), jx_expression(const))
def __init__(self, terms, **clauses):
    """
    Concatenation operator.

    :param terms: terms to concatenate; a data mapping is reduced to its
                  single (name, value) pair
    :param clauses: optional `separator` (must be a literal) and `default`
    """
    Expression.__init__(self, terms)
    if is_data(terms):
        self.terms = first(terms.items())
    else:
        self.terms = terms
    self.separator = clauses.get(str("separator"), Literal(""))
    self.default = clauses.get(str("default"), NULL)
    if not is_literal(self.separator):
        Log.error("Expecting a literal separator")
def to_esfilter(self, schema):
    """
    Convert this starts-with expression to an ES filter: a prefix filter when
    operating on a variable with a literal prefix, otherwise a Painless script.
    """
    if not self.value:
        return MATCH_ALL
    elif is_op(self.value, Variable_) and is_literal(self.prefix):
        var = first(schema.leaves(self.value.var)).es_column
        return {"prefix": {var: self.prefix.value}}
    else:
        # FIX: was `PainlessBasicStartsWithOp.self.to_es_script(self, schema)` —
        # `.self` is not an attribute of the class and raised AttributeError
        output = PainlessBasicStartsWithOp.to_es_script(self, schema)
        if output is false_script:
            return MATCH_NONE
        return output
def to_esfilter(self, schema):
    """
    Convert this regular-expression match to an ES regexp filter.

    Only a literal pattern applied to a single-column variable is supported;
    zero matching columns can never match, more than one is an error.
    """
    if is_literal(self.pattern) and is_op(self.var, Variable_):
        cols = schema.leaves(self.var.var)
        if len(cols) == 0:
            return MATCH_NONE
        elif len(cols) == 1:
            return {"regexp": {first(cols).es_column: self.pattern.value}}
        else:
            # FIX: error message was garbled ("regex on not supported ")
            Log.error("regex on multiple columns not supported")
    else:
        Log.error("regex only accepts a variable and literal pattern")
def __data__(self):
    """
    Return this container in "list" format: {"meta": ..., "data": rows}.
    When the only column is "." the rows are plain values and are passed
    through untouched; otherwise each row is unpacked key-by-key.
    """
    if first(self.schema.columns).name == '.':
        rows = self.data
    else:
        rows = [
            {name: unwraplist(value) for name, value in row.items()}
            for row in self.data
        ]
    return wrap({
        "meta": {"format": "list"},
        "data": rows
    })
def to_es(self, schema):
    """
    Convert this "missing" expression to an ES filter; a variable missing
    means ALL its leaf columns are missing.
    """
    if is_op(self.expr, Variable_):
        cols = schema.leaves(self.expr.var)
        if not cols:
            # NO SUCH COLUMN: VACUOUSLY MISSING EVERYWHERE
            return MATCH_ALL
        elif len(cols) == 1:
            return es_missing(first(cols).es_column)
        else:
            return es_and([es_missing(c.es_column) for c in cols])
    else:
        return PainlessMissingOp.to_es_script(self, schema).to_es(schema)
def append_query(self, query_path, es_query):
    """
    Nest one terms aggregation (plus a "missing" branch) per field of this
    dimension, innermost first; only the outermost level carries the decoder.
    """
    decoder = self
    for i, v in enumerate(self.fields):
        exists = v.exists().partial_eval()
        nest = Aggs()
        nest.add(TermsAggs("_match", {
            "field": first(self.schema.leaves(v.var)).es_column,
            "size": self.domain.limit
        }, decoder).add(es_query))
        nest.add(FilterAggs("_missing", NotOp(exists), decoder).add(es_query))
        es_query = nest
        decoder = None  # ONLY ATTACH THE DECODER ONCE

    if self.domain.where:
        es_query = FilterAggs("_filter", self.domain.where, None).add(es_query)

    return es_query
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Build a ranges aggregation over `edge`'s partitions; when allowNulls is
    set, add a "missing" branch for values outside [min, max) or absent.
    (Duplicate of the sibling implementation.)

    :param to_float: converts partition bounds to the numeric form ES expects
    """
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        output.add(FilterAggs(
            "_missing",
            NotOp(AndOp([
                edge.value.exists(),
                GteOp([edge.value, Literal(to_float(_min))]),
                LtOp([edge.value, Literal(to_float(_max))])
            ]).partial_eval()),
            self
        ).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": text_type(Painless[edge.value].to_es_script(schema))}
    calc['ranges'] = [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
def es_setop(es, query):
    """
    Run a set-operation (plain SELECT) query against ES.

    Translates each select clause into ES source/field/script requests plus a
    "pull" function that extracts the value from a hit, executes the search,
    and formats the hits per query.format.

    :param es: the ES cluster/index handle passed to es_post
    :param query: the jx query (select, where, sort, limit, format)
    :return: formatted output with timing/meta attached
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # ONE ESSelect PER NESTED PATH, CREATED ON DEMAND
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # DEEPER NESTED COLUMN: ACCUMULATE THE WHOLE NESTED DOC
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {"name": select.name, "index": put_index, "child": child},
                                "pull": pull
                            })
            else:
                # UNKNOWN COLUMN: SELECT A DUMMY SO THE SHAPE IS PRESERVED
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # GENERAL EXPRESSION: RENDER AS PAINLESS SCRIPT(S), SPLIT BY PATH
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # ASSIGN DEFAULT pull FUNCTIONS TO ANY SELECT THAT DID NOT GET ONE
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in a terms aggregation whose key is the multivalue field
    joined into a single pipe-delimited string (via the LIST_TO_PIPE script).
    """
    es_field = first(self.query.frum.schema.leaves(self.var)).es_column
    # NOTE(review): unlike the sized variant of this method elsewhere, no
    # "size" is set here, so the ES default terms-bucket count applies —
    # confirm this is intended
    return Aggs().add(TermsAggs("_match", {
        "script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'})
    }, self).add(es_query))
def __new__(cls, e=None, query=None, *args, **kwargs):
    """
    Decoder factory: inspect the edge `e` (its value expression and domain
    type) and dispatch to the concrete decoder subclass that can handle it.

    :param e: the edge being decoded; its domain may be rewritten in place
    :param query: used to resolve the edge's variable against the schema
    :return: an (uninitialized) instance of the selected decoder subclass
    """
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # if query.groupby:
        #     return object.__new__(DefaultDecoder, e)

        if is_text(e.value):
            Log.error("Expecting Variable or Expression, not plain string")

        if is_op(e.value, LeavesOp):
            return object.__new__(ObjectDecoder)
        elif is_op(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(is_op(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")

            e.domain = Data(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder)
        elif is_op(e.value, Variable):
            schema = query.frum.schema
            cols = schema.leaves(e.value.var)
            if not cols:
                return object.__new__(DefaultDecoder)
            if len(cols) != 1:
                return object.__new__(ObjectDecoder)
            col = first(cols)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

            if col.cardinality == None:
                # COLUMN METADATA NOT YET COLLECTED; FALL BACK TO DEFAULT DOMAIN
                DEBUG and Log.warning(
                    "metadata for column {{name|quote}} (id={{id}}) is not ready",
                    name=concat_field(col.es_index, col.es_column),
                    id=id(col)
                )
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            elif col.partitions == None:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            else:
                DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                if col.multi > 1 and len(col.partitions) < 10:
                    return object.__new__(MultivalueDecoder)

                # KNOWN PARTITIONS: BUILD A SIMPLE SET DOMAIN (SORTED, LIMITED)
                partitions = col.partitions[:limit:]
                if e.domain.sort == -1:
                    partitions = list(reversed(sorted(partitions)))
                else:
                    partitions = sorted(partitions)
                e.domain = SimpleSetDomain(partitions=partitions, limit=limit)
        else:
            return object.__new__(DefaultDecoder)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder)
    if e.range:
        return object.__new__(GeneralRangeDecoder)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if is_data(fields):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)