def add_column(c, query_path):
    # NOTE: NESTED HELPER; self AND meta COME FROM THE ENCLOSING SCOPE
    c.last_updated = Date.now()
    c.table = join_field([c.es_index] + split_field(query_path[0]))

    with self.meta.columns.locker:
        self._upsert_column(c)
        for alias in meta.aliases:
            c = copy(c)
            c.table = join_field([alias] + split_field(query_path[0]))
            self._upsert_column(c)
def _get_schema_from_list(frum, columns, prefix, nested_path, name_to_column):
    """
    SCAN THE LIST FOR COLUMN TYPES
    """
    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            full_name = join_field(prefix)
            column = name_to_column.get(full_name)
            if not column:
                column = Column(
                    name=full_name,
                    table=".",
                    es_column=full_name,
                    es_index=".",
                    type="undefined",
                    nested_path=nested_path
                )
                columns[full_name] = column
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix + [name])
                column = name_to_column.get(full_name)
                if not column:
                    column = Column(
                        name=full_name,
                        table=".",
                        es_column=full_name,
                        es_index=".",
                        type="undefined",
                        nested_path=nested_path
                    )
                    columns[full_name] = column

                if isinstance(value, list):
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        this_type = _type_to_name[value[0].__class__]
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]

                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], columns, prefix + [name], nested_path, name_to_column)
                elif this_type == "nested":
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, columns, prefix + [name], newpath, name_to_column)
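# A MINIMAL USAGE SKETCH (NOT FROM THE SOURCE): THE PUBLIC ENTRY POINT IS
# ASSUMED TO SEED AN EMPTY COLUMN MAP AND THE ROOT NESTED PATH ["."]; THE
# SAME dict SERVES AS BOTH columns AND name_to_column
def get_schema_from_list_example():
    data = [
        {"a": 1, "b": "x"},
        {"a": 2, "b": "y", "c": {"d": 3}}  # "c.d" APPEARS IN ONLY SOME ROWS
    ]
    columns = {}
    _get_schema_from_list(data, columns, prefix=[], nested_path=["."], name_to_column=columns)
    # EXPECT KEYS "a", "b", "c" AND "c.d"; "a" MERGES undefined -> NUMERIC,
    # "c" MERGES TO object, AND THE RECURSIVE CALL DISCOVERS "c.d"
    return columns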
def getFrameVariables(self, body):
    contextVariables = []
    columns = self.fromData.columns

    parentVarNames = set()  # ALL PARENTS OF VARIABLES WITH "." IN NAME
    body = body.replace(".?", ".")

    for i, c in enumerate(columns):
        j = body.find(c.name, 0)
        while j >= 0:
            s = j
            j = body.find(c.name, s + 1)

            test0 = body[s - 1: s + len(c.name) + 1:]
            test3 = body[s - 8: s + len(c.name):]

            if test0[:-1] == "\"" + c.name:
                continue
            if test3 == "_source." + c.name:
                continue

            def defParent(name):
                # DO NOT MAKE THE SAME PARENT TWICE
                if name in parentVarNames:
                    return
                parentVarNames.add(name)

                if len(split_field(name)) == 1:
                    contextVariables.append("Map " + name + " = new HashMap();\n")
                else:
                    defParent(join_field(split_field(name)[0:-1]))
                    contextVariables.append(name + " = new HashMap();\n")

            body = body.replace(c.name, "-" * len(c.name))
            if self.isLean or c.useSource:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n")
            else:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
            break

    return "".join(contextVariables)
def getDomain(self, **kwargs):
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields) - 1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        partitions = FlatList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname
    """
    # NOTE: NESTED HELPER; primary_column, primary_nested AND primary_branch
    # COME FROM THE ENCLOSING SCOPE
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception, e:
            Log.error("{{name}} does not exist", name=fieldname)
        if isinstance(d, list) and len(col) > 1:
            if len(primary_column) <= depth + i:
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d
            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    return fieldname, None
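# HYPOTHETICAL EXAMPLE (NOT FROM THE SOURCE): WALKING "a.b.c" OVER THE ROW
# BELOW FINDS A LIST AT "b", RECORDS IT AS THE PRIMARY NESTED BRANCH, AND
# RETURNS THE (first, rest) SPLIT AT THAT POINT
row = {"a": {"b": [{"c": 1}, {"c": 2}]}}
first, rest = parse_field("a.b.c", row, depth=0)
# first == "b", rest == "c"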
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        short_name = join_field(split_field(table_name)[0:1])
        table = self.get_table(short_name)[0]

        if not table:
            table = Table(
                name=short_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=short_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=short_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(table_name, column_name)
        if columns:
            columns = jx.sort(columns, "name")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                Log.note(
                    "waiting for columns to update {{columns|json}}",
                    columns=[c.table + "." + c.es_column for c in columns if not c.last_updated]
                )
                Till(seconds=1).wait()
            return columns
    except Exception, e:
        Log.error("Not expected", cause=e)
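# SKETCH OF EXPECTED USE (INSTANCE AND TABLE NAME HYPOTHETICAL): THE FIRST
# SEGMENT OF table_name IS THE ES INDEX; THE CALL WAITS, IN 1-SECOND SLEEPS,
# UNTIL THE BACKGROUND todo QUEUE HAS STAMPED last_updated ON EVERY MATCH
columns = meta.get_columns("unittest.run.files")
known_names = [c.name for c in columns]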
def set(constants):
    """
    REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS.
    THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES.
    USEFUL FOR SETTING DEBUG FLAGS.
    """
    if not constants:
        return
    constants = wrap(constants)

    for k, new_value in constants.leaves():
        errors = []
        try:
            old_value = pyDots.set_attr(sys.modules, k, new_value)
            continue
        except Exception, e:
            errors.append(e)

        # ONE MODULE IS MISSING, THE CALLING MODULE
        try:
            caller_globals = sys._getframe(1).f_globals
            caller_file = caller_globals["__file__"]
            if not caller_file.endswith(".py"):
                raise Exception("do not know how to handle non-python caller")
            caller_module = caller_file[:-3].replace("/", ".")

            path = split_field(k)
            for i, p in enumerate(path):
                if i == 0:
                    continue
                prefix = join_field(path[:i])  # TRY EVER-LONGER MODULE PREFIXES
                name = join_field(path[i:])
                if caller_module.endswith(prefix):
                    old_value = pyDots.set_attr(caller_globals, name, new_value)
                    if DEBUG:
                        from pyLibrary.debugs.logs import Log

                        Log.note(
                            "Changed {{module}}[{{attribute}}] from {{old_value}} to {{new_value}}",
                            module=prefix,
                            attribute=name,
                            old_value=old_value,
                            new_value=new_value
                        )
                    break
        except Exception, e:
            errors.append(e)
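# MINIMAL SKETCH, ASSUMING pyLibrary.debugs.logs EXPOSES A MODULE-LEVEL
# DEBUG FLAG: leaves() YIELDS THE DOTTED PATH "pyLibrary.debugs.logs.DEBUG",
# AND set_attr() WALKS sys.modules TO ASSIGN THE NEW VALUE
set({"pyLibrary": {"debugs": {"logs": {"DEBUG": True}}}})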
def wrap_from(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _containers:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not _containers.config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        index = frum
        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = _containers.config.default.type
            index = join_field(split_field(frum)[:1:])

        settings = set_default(
            {
                "index": index,
                "name": frum
            },
            _containers.config.default.settings
        )
        settings.type = None
        return _containers.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and _containers.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return _containers.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import QueryOp

        return QueryOp.wrap(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum
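# RESOLUTION RULES ABOVE, IN BRIEF (INDEX NAME HYPOTHETICAL): ONLY THE FIRST
# SEGMENT OF A DOTTED NAME SELECTS THE ES INDEX; THE REST IS A QUERY PATH
#   wrap_from("meta.columns")       -> THE COLUMN-METADATA CONTAINER
#   wrap_from("unittest.run.files") -> ES CONTAINER ON INDEX "unittest"
assert join_field(split_field("unittest.run.files")[:1:]) == "unittest"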
def new_instance(type, frum, schema=None):
    """
    Factory!
    """
    if not type2container:
        _delayed_imports()

    if isinstance(frum, Container):
        return frum
    elif isinstance(frum, _Cube):
        return frum
    elif isinstance(frum, _Query):
        return _run(frum)
    elif isinstance(frum, (list, set, GeneratorType)):
        return _ListContainer(frum)
    elif isinstance(frum, basestring):
        # USE DEFAULT STORAGE TO FIND Container
        if not config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        settings = set_default(
            {
                "index": join_field(split_field(frum)[:1:]),
                "name": frum,
            },
            config.default.settings
        )
        settings.type = None  # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY
        return type2container["elasticsearch"](settings)
    elif isinstance(frum, Mapping):
        frum = wrap(frum)
        if frum.type and type2container[frum.type]:
            return type2container[frum.type](frum.settings)
        elif frum["from"]:
            frum = copy(frum)
            frum["from"] = Container(frum["from"])
            return _Query.wrap(frum)
        else:
            Log.error("Do not know how to handle {{frum|json}}", frum=frum)
    else:
        Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
def compile_expression(self, expression, constants=None):
    # EXPAND EXPRESSION WITH ANY CONSTANTS
    expression = setValues(expression, constants)

    fromPath = self.fromData.name  # FIRST NAME IS THE INDEX
    indexName = join_field(split_field(fromPath)[:1:])

    context = self.getFrameVariables(expression)
    if context == "":
        return addFunctions(expression).head + expression

    func = UID()
    code = addFunctions(context + expression)
    output = code.head + \
        'var ' + func + ' = function(' + indexName + '){\n' + \
        context + \
        expression + ";\n" + \
        '};\n' + \
        func + '(_source)\n'

    return Compiled(output)
def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path:
    :return:
    """
    sub_path = split_field(path)[1:]

    if sub_path:
        f0 = {}
        f1 = {}
        output = wrap({
            "filter": {"and": [
                f0,
                {"nested": {
                    "path": join_field(sub_path),
                    "filter": f1,
                    "inner_hits": {"size": 100000}
                }}
            ]},
            "from": 0,
            "size": 0,
            "sort": []
        })
        return output, wrap([f0, f1])
    else:
        f0 = {}
        output = wrap({
            "query": {"filtered": {
                "filter": f0
            }},
            "from": 0,
            "size": 0,
            "sort": []
        })
        return output, wrap([f0])
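# SKETCH (PATH HYPOTHETICAL): THE SECOND RETURN VALUE HOLDS LIVE REFERENCES
# INTO THE TEMPLATE, SO CALLERS FILL THE FILTERS IN PLACE
template, filters = es_query_template("unittest.run.files")
filters[0]["term"] = {"build.platform": "linux64"}  # OUTER FILTER
filters[1]["term"] = {"run.files.name": "log.txt"}  # FILTER INSIDE THE nested CLAUSE
# template NOW CARRIES BOTH, BECAUSE f0 AND f1 WERE EMBEDDED BY REFERENCE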
def __init__(self, dim, parent, jx):
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    self.edges = None  # FOR NOW
    dot.set_default(self, dim)
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Data()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, jx)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        self.fields = wrap(fields)
        edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
    else:
        self.fields = listwrap(fields)
        edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    jx.get_columns()
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = jx.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Data(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(
                temp,
                dim.path(d.getEnd(d.partitions[i])),
                count,
                0
            )
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = FlatList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            if isinstance(fields, Mapping):
                output = Data()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
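# HYPOTHETICAL CONFIG (NOT FROM THE SOURCE): A "set" DIMENSION OVER A SINGLE
# FIELD; THE CONSTRUCTOR ISSUES THE count QUERY ABOVE AND KEEPS ONE
# PARTITION PER DISTINCT VALUE (jx IS AN ALREADY-CONFIGURED QUERY CONTAINER)
platform_dim = Dimension(
    {"name": "platform", "field": "build.platform", "index": "unittest"},
    None,  # NO PARENT DIMENSION
    jx
)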
def query_path(self):
    return join_field(split_field(self.name)[1:])
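# EXAMPLE (NAME HYPOTHETICAL): THE LEADING INDEX NAME IS DROPPED, LEAVING
# THE PATH THAT IS QUERIED WITHIN THE INDEX
assert join_field(split_field("unittest.run.files")[1:]) == "run.files"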
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    columns = query.frum.get_columns()
    leaf_columns = set(c.name for c in columns if c.type not in STRUCT and (c.nested_path[0] == "." or c.es_column == c.nested_path))
    nested_columns = set(c.name for c in columns if len(c.nested_path) != 1)

    i = 0
    source = "fields"
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            term = select.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(selects.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": Variable(n),
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    parent = term.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": select.name + "." + c[prefix:],
                                "value": Variable(c),
                                "put": {"name": select.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var == "_id":
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var in nested_columns or [c for c in nested_columns if c.startswith(select.value.var + ".")]:
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = select.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(select.value.var)
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": select.name,
                            "value": Variable(n),
                            "put": {"name": select.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            es_query.script_fields[literal_field(select.name)] = {"script": select.value.to_ruby()}
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {"name": select.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value.var))
        elif isinstance(n.value, Variable):
            n.pull = "fields." + literal_field(n.value.var)
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
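# SKETCH OF ONE new_select RECORD BUILT ABOVE (VALUES HYPOTHETICAL): "pull"
# SAYS WHERE TO READ FROM EACH ES HIT, "put" SAYS WHERE THE VALUE LANDS IN
# THE OUTPUT ROW; literal_field ESCAPES DOTS SO THE NAME STAYS ONE KEY
example_select = {
    "name": "run.files.name",
    "pull": "fields." + literal_field("run.files.name"),  # -> "fields.run\\.files\\.name"
    "put": {"name": "run.files.name", "index": 0, "child": "."}
}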
def parse_properties(parent_index_name, parent_name, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
    """
    from pyLibrary.queries.meta import Column

    columns = FlatList()
    for name, property in esProperties.items():
        index_name = parent_index_name
        column_name = join_field(split_field(parent_name) + [name])

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, column_name, property.properties)
            for c in self_columns:
                c.nested_path = [column_name] + c.nested_path
            columns.extend(self_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=column_name,
                es_column=column_name,
                type="nested",
                nested_path=ROOT_PATH
            ))
            continue

        if property.properties:
            child_columns = parse_properties(index_name, column_name, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=column_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled == False else "object"
            ))

        if property.dynamic:
            continue
        if not property.type:
            continue

        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=column_name,
                        es_column=column_name,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
                else:
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=column_name + "\\." + n,
                        es_column=column_name + "\\." + n,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=column_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                columns.append(Column(
                    table=index_name,
                    es_index=index_name,
                    es_column=column_name,
                    name=column_name,
                    nested_path=ROOT_PATH,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=column_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled == False else "object"
            ))
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=column_name)

    return columns
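# MINIMAL SKETCH (MAPPING FRAGMENT HYPOTHETICAL): A nested PROPERTY EMITS
# ITS CHILD COLUMNS WITH nested_path PREFIXED BY THE PARENT PATH
es_properties = wrap({
    "run": {"properties": {
        "files": {
            "type": "nested",
            "properties": {"name": {"type": "string"}}
        }
    }}
})
cols = parse_properties("unittest", ".", es_properties)
# EXPECT "run" (object), "run.files" (nested), AND "run.files.name"
# (string, WITH nested_path == ["run.files"] + ROOT_PATH)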