# Refresh column metadata for `table` from the default ES cluster: parse the mapping of each
# index (newest first), filter out noisy "other."/"previous_values.cf_" columns, and upsert each
# column once per absolute index name and once per alias (only the latest alias per name).
# NOTE(review): in this collapsed text the alias loop appears to reuse the last `c` of the
# column loop — confirm the intended nesting against the originally-formatted source.
def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE alias_done = set() index = split_field(table)[0] query_path = split_field(table)[1:] metadata = self.default_es.get_metadata(index=index) for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}): for _, properties in meta.mappings.items(): columns = _elasticsearch.parse_properties(index, None, properties.properties) columns = columns.filter(lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_")) # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG): with self.columns.locker: for c in columns: # ABSOLUTE c.table = join_field([index]+query_path) self.upsert_column(c) for alias in meta.aliases: # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS if alias in alias_done: continue alias_done.add(alias) c = copy(c) c.table = join_field([alias]+query_path) self.upsert_column(c)
# Emit MVEL/Java variable declarations for every column name referenced in `body`, so the
# generated script can address nested fields. Occurrences preceded by a quote or by "_source."
# are skipped; `defParent` lazily declares each dotted parent as a HashMap exactly once
# (memoized via parentVarNames). Matched names are blanked out of `body` ("-"*len) so the
# same column is not declared twice. Presumably lean-index columns read via getSourceValue /
# _source and the rest via getDocValue — TODO confirm against the script runtime.
def getFrameVariables(self, body): contextVariables = [] columns = self.fromData.columns parentVarNames = set() # ALL PARENTS OF VARIABLES WITH "." IN NAME body = body.replace(".?", ".") for i, c in enumerate(columns): j = body.find(c.name, 0) while j >= 0: s = j j = body.find(c.name, s + 1) test0 = body[s - 1:s + len(c.name) + 1:] test3 = body[s - 8:s + len(c.name):] if test0[:-1] == "\"" + c.name: continue if test3 == "_source." + c.name: continue def defParent(name): # DO NOT MAKE THE SAME PARENT TWICE if name in parentVarNames: return parentVarNames.add(name) if len(split_field(name)) == 1: contextVariables.append("Map " + name + " = new HashMap();\n") else: defParent(join_field(split_field(name)[0:-1])) contextVariables.append(name + " = new HashMap();\n") body = body.replace(c.name, "-" * len(c.name)) if self.isLean or c.useSource: if len(split_field(c.name)) > 1: defParent(join_field(split_field(c.name)[0:-1])) contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n") else: contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n") else: if len(split_field(c.name)) > 1: defParent(join_field(split_field(c.name)[0:-1])) contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n") else: contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n") break return "".join(contextVariables)
# Closure (relies on enclosing-scope `self` and `meta`): stamp the column as freshly updated,
# upsert it under its absolute index name, then upsert a copy under every index alias.
# NOTE(review): `meta` is not a parameter — this must be nested inside a method that defines it.
def add_column(c, query_path): c.last_updated = Date.now() c.table = join_field([c.es_index]+split_field(query_path[0])) with self.meta.columns.locker: self._upsert_column(c) for alias in meta.aliases: c = copy(c) c.table = join_field([alias]+split_field(query_path[0])) self._upsert_column(c)
# Find the deepest prefix of `field` (qualified with the schema's ES alias) that is a known
# nested path in INDEX_CACHE; return that path with its leading alias segment stripped, or
# None when no prefix is nested. NOTE(review): `reverse` here is presumably a project helper
# that yields enumerate() pairs longest-prefix-first (the builtin is `reversed`) — confirm.
def _get_nested_path(field, schema): if not INDEX_CACHE: _late_import() if is_keyword(field): field = join_field([schema.es.alias] + split_field(field)) for i, f in reverse(enumerate(split_field(field))): path = join_field(split_field(field)[0:i + 1:]) if path in INDEX_CACHE: return join_field(split_field(path)[1::]) return None
# Infer a Column per property by scanning every row of `frum`: primitive rows merge into the
# prefix column; dict rows recurse per key. Lists of length != 1 whose elements are objects are
# promoted to "nested" (with an extended nested_path); single-object values recurse as "object".
# Types are widened through the _merge_type lattice. Mutates `columns`/`name_to_column` in place.
def _get_schema_from_list(frum, columns, prefix, nested_path, name_to_column): """ SCAN THE LIST FOR COLUMN TYPES """ for d in frum: row_type = _type_to_name[d.__class__] if row_type != "object": full_name = join_field(prefix) column = name_to_column.get(full_name) if not column: column = Column( name=full_name, table=".", es_column=full_name, es_index=".", type="undefined", nested_path=nested_path ) columns[full_name] = column column.type = _merge_type[column.type][row_type] else: for name, value in d.items(): full_name = join_field(prefix + [name]) column = name_to_column.get(full_name) if not column: column = Column( name=full_name, table=".", es_column=full_name, es_index=".", type="undefined", nested_path=nested_path ) columns[full_name] = column if isinstance(value, list): if len(value)==0: this_type = "undefined" elif len(value)==1: this_type = _type_to_name[value[0].__class__] else: this_type = _type_to_name[value[0].__class__] if this_type == "object": this_type = "nested" else: this_type = _type_to_name[value.__class__] new_type = _merge_type[column.type][this_type] column.type = new_type if this_type == "object": _get_schema_from_list([value], columns, prefix + [name], nested_path, name_to_column) elif this_type == "nested": np = listwrap(nested_path) newpath = unwraplist([join_field(split_field(np[0])+[name])]+np) _get_schema_from_list(value, columns, prefix + [name], newpath, name_to_column)
# Closure over enclosing-scope `self` and `meta`: mark the column updated, upsert it under its
# absolute es_index-qualified table name, then upsert copies under each index alias.
# NOTE(review): only the first upsert happens under the columns lock — confirm intended.
def add_column(c, query_path): c.last_updated = Date.now() c.table = join_field([c.es_index] + split_field(query_path[0])) with self.meta.columns.locker: self._upsert_column(c) for alias in meta.aliases: c = copy(c) c.table = join_field([alias] + split_field(query_path[0])) self._upsert_column(c)
# Row-scanning schema inference: widen each discovered column's type via _merge_type, recursing
# into single objects ("object") and multi-element object lists ("nested", extending nested_path).
# Accumulates into the mutable `columns` / `name_to_column` maps; returns nothing.
def _get_schema_from_list(frum, columns, prefix, nested_path, name_to_column): """ SCAN THE LIST FOR COLUMN TYPES """ for d in frum: row_type = _type_to_name[d.__class__] if row_type != "object": full_name = join_field(prefix) column = name_to_column.get(full_name) if not column: column = Column(name=full_name, table=".", es_column=full_name, es_index=".", type="undefined", nested_path=nested_path) columns[full_name] = column column.type = _merge_type[column.type][row_type] else: for name, value in d.items(): full_name = join_field(prefix + [name]) column = name_to_column.get(full_name) if not column: column = Column(name=full_name, table=".", es_column=full_name, es_index=".", type="undefined", nested_path=nested_path) columns[full_name] = column if isinstance(value, list): if len(value) == 0: this_type = "undefined" elif len(value) == 1: this_type = _type_to_name[value[0].__class__] else: this_type = _type_to_name[value[0].__class__] if this_type == "object": this_type = "nested" else: this_type = _type_to_name[value.__class__] new_type = _merge_type[agg_type][this_type] column.type = new_type if this_type == "object": _get_schema_from_list([value], columns, prefix + [name], nested_path, name_to_column) elif this_type == "nested": np = listwrap(nested_path) newpath = unwraplist( [join_field(split_field(np[0]) + [name])] + np) _get_schema_from_list(value, columns, prefix + [name], newpath, name_to_column)
# Generate script-context declarations for each column name found in `body`: skip quoted or
# "_source."-prefixed occurrences, declare dotted parents once as HashMaps (defParent), blank
# the matched name out of `body`, and bind the leaf via _source/getSourceValue/getDocValue
# depending on self.isLean / c.useSource. Returns the concatenated declaration text.
def getFrameVariables(self, body): contextVariables = [] columns = self.fromData.columns parentVarNames = set() # ALL PARENTS OF VARIABLES WITH "." IN NAME body = body.replace(".?", ".") for i, c in enumerate(columns): j = body.find(c.name, 0) while j >= 0: s = j j = body.find(c.name, s + 1) test0 = body[s - 1: s + len(c.name) + 1:] test3 = body[s - 8: s + len(c.name):] if test0[:-1] == "\"" + c.name: continue if test3 == "_source." + c.name: continue def defParent(name): # DO NOT MAKE THE SAME PARENT TWICE if name in parentVarNames: return parentVarNames.add(name) if len(split_field(name)) == 1: contextVariables.append("Map " + name + " = new HashMap();\n") else: defParent(join_field(split_field(name)[0:-1])) contextVariables.append(name + " = new HashMap();\n") body = body.replace(c.name, "-"*len(c.name)) if self.isLean or c.useSource: if len(split_field(c.name)) > 1: defParent(join_field(split_field(c.name)[0:-1])) contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n") else: contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n") else: if len(split_field(c.name)) > 1: defParent(join_field(split_field(c.name)[0:-1])) contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n") else: contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n") break return "".join(contextVariables)
# Return column metadata for `table_name` (optionally one column), refreshing from ES when the
# table is unknown, `force` is set, or the cached metadata is older than
# MAX_COLUMN_METADATA_AGE. Blocks (1s polls) while the todo queue is still updating the
# returned columns. Python 2 `except Exception, e` syntax; errors are re-raised via Log.error.
def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ try: # LAST TIME WE GOT INFO FOR THIS TABLE short_name = join_field(split_field(table_name)[0:1]) table = self.get_table(short_name)[0] if not table: table = Table( name=short_name, url=None, query_path=None, timestamp=Date.now() ) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=short_name) elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=short_name) with self.meta.columns.locker: columns = self.meta.columns.find(table_name, column_name) if columns: columns = jx.sort(columns, "name") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated]) Thread.sleep(seconds=1) return columns except Exception, e: Log.error("Not expected", cause=e)
# Materialize this domain's partitions, honoring kwargs.depth: with no explicit partitions the
# edges become overlapping facet partitions (isFacet=True); depth None/0 copies the top-level
# partitions; depth 1 flattens one level of sub-partitions (names joined parent.child). All
# paths cap output at coalesce(self.limit, DEFAULT_QUERY_LIMIT). Python 2 except syntax.
def getDomain(self, **kwargs): # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS kwargs = wrap(kwargs) kwargs.depth = coalesce(kwargs.depth, len(self.fields)-1 if isinstance(self.fields, list) else None) if not self.partitions and self.edges: # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP partitions = [ { "name": v.name, "value": v.name, "where": v.where, "style": v.style, "weight": v.weight # YO! WHAT DO WE *NOT* COPY? } for i, v in enumerate(self.edges) if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where ] self.isFacet = True elif kwargs.depth == None: # ASSUME self.fields IS A dict partitions = DictList() for i, part in enumerate(self.partitions): if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT): break partitions.append({ "name":part.name, "value":part.value, "where":part.where, "style":coalesce(part.style, part.parent.style), "weight":part.weight # YO! WHAT DO WE *NOT* COPY? }) elif kwargs.depth == 0: partitions = [ { "name":v.name, "value":v.value, "where":v.where, "style":v.style, "weight":v.weight # YO! WHAT DO WE *NOT* COPY? } for i, v in enumerate(self.partitions) if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)] elif kwargs.depth == 1: partitions = DictList() rownum = 0 for i, part in enumerate(self.partitions): if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT): continue rownum += 1 try: for j, subpart in enumerate(part.partitions): partitions.append({ "name":join_field(split_field(subpart.parent.name) + [subpart.name]), "value":subpart.value, "where":subpart.where, "style":coalesce(subpart.style, subpart.parent.style), "weight":subpart.weight # YO! WHAT DO WE *NOT* COPY? }) except Exception, e: Log.error("", e)
# Walk `data` down the dotted `fieldname`; on the first list encountered (with path remaining)
# record the branch in the enclosing-scope primary_column/primary_nested/primary_branch arrays
# and return (segment, remaining_path). Enforces a single nested branch per depth.
# NOTE(review): if no list is hit the function falls off the loop and implicitly returns None.
def parse_field(fieldname, data, depth): """ RETURN (first, rest) OF fieldname """ col = split_field(fieldname) d = data for i, c in enumerate(col): try: d = d[c] except Exception, e: Log.error("{{name}} does not exist", name=fieldname) if isinstance(d, list) and len(col) > 1: if len(primary_column) <= depth + i: primary_nested.append(True) primary_column.append(c) primary_branch.append(d) elif primary_nested[depth] and primary_column[depth + i] != c: Log.error("only one branch of tree allowed") else: primary_nested[depth + i] = True primary_column[depth + i] = c primary_branch[depth + i] = d return c, join_field(col[i + 1:]) else: if len(primary_column) <= depth + i: primary_nested.append(False) primary_column.append(c) primary_branch.append([d])
# Build an ES query skeleton for `path`: deep paths get an and-filter with a plain slot (f0)
# and a nested-filter slot (f1, with inner_hits); shallow paths get a single filtered-query
# slot. Returns (template, [filter slots]) so callers fill the slots in place by mutation.
def es_query_template(path): """ RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE :param path: :return: """ sub_path = split_field(path)[1:] if sub_path: f0 = {} f1 = {} output = wrap( { "filter": { "and": [ f0, {"nested": {"path": join_field(sub_path), "filter": f1, "inner_hits": {"size": 100000}}}, ] }, "from": 0, "size": 0, "sort": [], } ) return output, wrap([f0, f1]) else: f0 = {} output = wrap({"query": {"filtered": {"filter": f0}}, "from": 0, "size": 0, "sort": []}) return output, wrap([f0])
# Descend `data` along the split `fieldname`, registering the first list-valued step into the
# shared primary_* bookkeeping (one nested branch allowed per depth) and returning the
# (segment, rest-of-path) pair; non-list steps are recorded as scalar branches.
# NOTE(review): implicit None return when the loop completes without finding a list.
def parse_field(fieldname, data, depth): """ RETURN (first, rest) OF fieldname """ col = split_field(fieldname) d = data for i, c in enumerate(col): try: d = d[c] except Exception, e: Log.error("{{name}} does not exist", name=fieldname) if isinstance(d, list) and len(col) > 1: if len(primary_column) <= depth + i: primary_nested.append(True) primary_column.append(c) primary_branch.append(d) elif primary_nested[depth] and primary_column[depth + i] != c: Log.error("only one branch of tree allowed") else: primary_nested[depth + i] = True primary_column[depth + i] = c primary_branch[depth + i] = d return c, join_field(col[i + 1 :]) else: if len(primary_column) <= depth + i: primary_nested.append(False) primary_column.append(c) primary_branch.append([d])
def set(constants): """ REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS. THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES. USEFUL FOR SETTING DEBUG FLAGS. """ if not constants: return constants = wrap(constants) for k, new_value in constants.leaves(): errors = [] try: old_value = dot.set_attr(sys.modules, k, new_value) continue except Exception, e: errors.append(e) # ONE MODULE IS MISSING, THE CALLING MODULE try: caller_globals = sys._getframe(1).f_globals caller_file = caller_globals["__file__"] if not caller_file.endswith(".py"): raise Exception("do not know how to handle non-python caller") caller_module = caller_file[:-3].replace("/", ".") path = split_field(k) for i, p in enumerate(path): if i == 0: continue prefix = join_field(path[:1]) name = join_field(path[i:]) if caller_module.endswith(prefix): old_value = dot.set_attr(caller_globals, name, new_value) if DEBUG: from pyLibrary.debugs.logs import Log Log.note("Changed {{module}}[{{attribute}}] from {{old_value}} to {{new_value}}", module= prefix, attribute= name, old_value= old_value, new_value= new_value) break except Exception, e: errors.append[e]
# Closure over enclosing-scope parentVarNames/contextVariables: declare `name` (and,
# recursively, every missing dotted ancestor) as a script HashMap exactly once; top-level
# names get an explicit "Map" declaration, nested names a bare assignment.
def defParent(name): # DO NOT MAKE THE SAME PARENT TWICE if name in parentVarNames: return parentVarNames.add(name) if len(split_field(name)) == 1: contextVariables.append("Map " + name + " = new HashMap();\n") else: defParent(join_field(split_field(name)[0:-1])) contextVariables.append(name + " = new HashMap();\n")
# Test-only helper: mark the query's table columns dirty (clearing cached stats), force them
# back onto the metadata todo queue, then poll (1s sleeps, up to one MINUTE) until every
# non-struct column has fresh last_updated/cardinality. Failures are logged as warnings, not
# raised. Python 2 `except Exception, e` syntax.
def _test_mode_wait(query): """ WAIT FOR METADATA TO ARRIVE ON INDEX :param query: dict() OF REQUEST BODY :return: nothing """ try: m = meta.singlton now = Date.now() end_time = now + MINUTE # MARK COLUMNS DIRTY m.meta.columns.update({ "clear": ["partitions", "count", "cardinality", "last_updated"], "where": { "eq": { "table": join_field(split_field(query["from"])[0:1]) } } }) # BE SURE THEY ARE ON THE todo QUEUE FOR RE-EVALUATION cols = [ c for c in m.get_columns(table_name=query["from"], force=True) if c.type not in STRUCT ] for c in cols: Log.note("Mark {{column}} dirty at {{time}}", column=c.name, time=now) c.last_updated = now - TOO_OLD m.todo.push(c) while end_time > now: # GET FRESH VERSIONS cols = [ c for c in m.get_columns(table_name=query["from"]) if c.type not in STRUCT ] for c in cols: if not c.last_updated or c.cardinality == None: Log.note( "wait for column (table={{col.table}}, name={{col.name}}) metadata to arrive", col=c) break else: break Thread.sleep(seconds=1) for c in cols: Log.note( "fresh column name={{column.name}} updated={{column.last_updated|date}} parts={{column.partitions}}", column=c) except Exception, e: Log.warning("could not pickup columns", cause=e)
# Streaming JSON object decoder (generator): scans properties from byte `index`, collecting
# expected scalar values into `destination` and yielding one record per element of the nested
# property named by `path` (merging already-seen siblings into each yield). Unneeded values
# are skipped via jump_to_end; expected properties appearing after the nested array are an
# error because earlier yields can no longer include them. Yields (record, new_index) pairs.
def _decode_object(index, parent_path, path, name2index, destination=None, expected_vars=NO_VARS): if destination is None: destination = {} nested_done = False while True: c, index = skip_whitespace(index) if c == b',': continue elif c == b'"': name, index = simple_token(index, c) c, index = skip_whitespace(index) if c != b':': Log.error("Expecting colon") c, index = skip_whitespace(index) child_expected = needed(name, expected_vars) if child_expected and nested_done: Log.error("Expected property found after nested json. Iteration failed.") full_path = join_field(split_field(parent_path)+ [name]) if path and (path[0] == full_path or path[0].startswith(full_path+".")): # THE NESTED PROPERTY WE ARE LOOKING FOR if path[0] == full_path: new_path = path[1:] else: new_path = path nested_done = True for j, i in _decode(index - 1, full_path, new_path, name2index, expected_vars=child_expected): index = i j = {name: j} for k, v in destination.items(): j.setdefault(k, v) yield j, index continue if child_expected: # SOME OTHER PROPERTY value, index = _decode_token(index, c, full_path, path, name2index, None, expected_vars=child_expected) destination[name] = value else: # WE DO NOT NEED THIS VALUE index = jump_to_end(index, c) continue elif c == "}": break if not nested_done: yield destination, index
# Coerce `frum` into a Container: strings resolve to the meta tables or to the configured
# default ES container (index = first dotted segment); mappings with a registered "type"
# build that container; mappings with "from" become a QueryOp; lists/sets wrap in a
# ListContainer; anything else passes through unchanged.
def wrap_from(frum, schema=None): """ :param frum: :param schema: :return: """ if not _containers: _delayed_imports() frum = wrap(frum) if isinstance(frum, basestring): if not _containers.config.default.settings: Log.error( "expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info" ) type_ = None index = frum if frum.startswith("meta."): if frum == "meta.columns": return _meta.singlton.meta.columns elif frum == "meta.tables": return _meta.singlton.meta.tables else: Log.error("{{name}} not a recognized table", name=frum) else: type_ = _containers.config.default.type index = join_field(split_field(frum)[:1:]) settings = set_default({ "index": index, "name": frum }, _containers.config.default.settings) settings.type = None return _containers.type2container[type_](settings) elif isinstance( frum, Mapping) and frum.type and _containers.type2container[frum.type]: # TODO: Ensure the frum.name is set, so we capture the deep queries if not frum.type: Log.error("Expecting from clause to have a 'type' property") return _containers.type2container[frum.type](frum.settings) elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))): from pyLibrary.queries.query import QueryOp return QueryOp.wrap(frum, schema=schema) elif isinstance(frum, (list, set)): return _ListContainer("test_list", frum) else: return frum
# Resolve `frum` to a Container instance: "meta.*" strings return the metadata singletons,
# other strings build the default ES container keyed by their first dotted segment, typed
# mappings dispatch through type2container, "from"-mappings become QueryOp, and list/set
# inputs wrap in a ListContainer; unrecognized values are returned as-is.
def wrap_from(frum, schema=None): """ :param frum: :param schema: :return: """ if not _containers: _delayed_imports() frum = wrap(frum) if isinstance(frum, basestring): if not _containers.config.default.settings: Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info") type_ = None index = frum if frum.startswith("meta."): if frum == "meta.columns": return _meta.singlton.meta.columns elif frum == "meta.tables": return _meta.singlton.meta.tables else: Log.error("{{name}} not a recognized table", name=frum) else: type_ = _containers.config.default.type index = join_field(split_field(frum)[:1:]) settings = set_default( { "index": index, "name": frum }, _containers.config.default.settings ) settings.type = None return _containers.type2container[type_](settings) elif isinstance(frum, Mapping) and frum.type and _containers.type2container[frum.type]: # TODO: Ensure the frum.name is set, so we capture the deep queries if not frum.type: Log.error("Expecting from clause to have a 'type' property") return _containers.type2container[frum.type](frum.settings) elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))): from pyLibrary.queries.query import QueryOp return QueryOp.wrap(frum, schema=schema) elif isinstance(frum, (list, set)): return _ListContainer("test_list", frum) else: return frum
# Render one markdown bullet per schema property (code-formatted dotted name, italic type,
# optional description), recursing with extra indent into object/array/nested properties.
# A property without a type is a hard error. Returns the accumulated list of lines.
def _inner(schema, parent_name, indent): more_lines = [] for k,v in schema.items(): full_name = join_field(split_field(parent_name)+[k]) details = indent+"* "+_md_code(full_name) if v.type: details += " - "+_md_italic(v.type) else: Log.error("{{full_name}} is missing type", full_name=full_name) if v.description: details += " " + v.description more_lines.append(details) if v.type in ["object", "array", "nested"]: more_lines.extend(_inner(v.properties, full_name, indent+" ")) return more_lines
# Register `column` in the in-memory schema (deduplicated by name+type); "nested" columns
# spawn a child Table_usingSQLite keyed by an extra UID column, while scalar columns are
# materialized immediately via ALTER TABLE ADD COLUMN.
def add_column(self, column): """ ADD COLUMN, IF IT DOES NOT EXIST ALREADY """ if column.name not in self.columns: self.columns[column.name] = {column} elif column.type not in [c.type for c in self.columns[column.name]]: self.columns[column.name].add(column) if column.type == "nested": nested_table_name = join_field(split_field(self.name) + split_field(column.name)) # MAKE THE TABLE table = Table_usingSQLite(nested_table_name, self.db, self.uid + [UID_PREFIX+"id"+unicode(len(self.uid))], exists=False) self.nested_tables[nested_table_name] = table else: self.db.execute( "ALTER TABLE " + quote_table(self.name) + " ADD COLUMN " + _quote_column(column) + " " + column.type )
# Produce the ES request skeleton for `path` plus the mutable filter slot(s) callers fill in:
# nested paths return an and-filter with an inner nested clause (inner_hits size 100000),
# top-level paths a single filtered-query slot. Returns (template, wrapped slot list).
def es_query_template(path): """ RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE :param path: :return: """ sub_path = split_field(path)[1:] if sub_path: f0 = {} f1 = {} output = wrap({ "filter": { "and": [ f0, { "nested": { "path": join_field(sub_path), "filter": f1, "inner_hits": { "size": 100000 } } } ] }, "from": 0, "size": 0, "sort": [] }) return output, wrap([f0, f1]) else: f0 = {} output = wrap({ "query": { "filtered": { "filter": f0 } }, "from": 0, "size": 0, "sort": [] }) return output, wrap([f0])
# Variant schema scan: accumulate per-name merged types in a local `names` dict while
# recursing into object/nested values, then append one Column per discovered name.
# NOTE(review): full_name uses ".".join(prefix+[n]) rather than join_field as elsewhere —
# confirm whether field-name escaping matters here.
def _get_schema_from_list(frum, columns, prefix, nested_path): """ SCAN THE LIST FOR COLUMN TYPES """ names = {} for d in frum: row_type = _type_to_name[d.__class__] if row_type!="object": agg_type = names.get(".", "undefined") names["."] = _merge_type[agg_type][row_type] else: for name, value in d.items(): agg_type = names.get(name, "undefined") if isinstance(value, list): if len(value)==0: this_type = "undefined" else: this_type=_type_to_name[value[0].__class__] if this_type=="object": this_type="nested" else: this_type = _type_to_name[value.__class__] new_type = _merge_type[agg_type][this_type] names[name] = new_type if this_type == "object": _get_schema_from_list([value], columns, prefix + [name], nested_path) elif this_type == "nested": np = listwrap(nested_path) newpath = unwraplist([join_field(split_field(np[0])+[name])]+np) _get_schema_from_list(value, columns, prefix + [name], newpath) for n, t in names.items(): full_name = ".".join(prefix + [n]) column = Column( name=full_name, table=".", es_column=full_name, es_index=".", type=t, nested_path=nested_path ) columns.append(column)
# Rewrite an expression so every variable is suffixed with ".$value" (typed-storage naming):
# Expression objects get a vars() rename map; literals/".", numbers, dates pass through;
# keywords get the suffix appended; queries and mappings recurse; other bare strings error.
def convert(self, expr): """ ADD THE ".$value" SUFFIX TO ALL VARIABLES """ if isinstance(expr, Expression): vars_ = expr.vars() rename = { v: join_field(split_field(v) + ["$value"]) for v in vars_ } return expr.map(rename) if expr is True or expr == None or expr is False: return expr elif Math.is_number(expr): return expr elif expr == ".": return "." elif is_keyword(expr): #TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX return expr + ".$value" elif isinstance(expr, basestring): Log.error("{{name|quote}} is not a valid variable name", name=expr) elif isinstance(expr, Date): return expr elif isinstance(expr, QueryOp): return self._convert_query(expr) elif isinstance(expr, Mapping): if expr["from"]: return self._convert_query(expr) elif len(expr) >= 2: #ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION return wrap({ name: self.convert(value) for name, value in expr.items() }) else: # ASSUME SINGLE-CLAUSE EXPRESSION k, v = expr.items()[0] return self.converter_map.get(k, self._convert_bop)(k, v) elif isinstance(expr, (list, set, tuple)): return wrap([self.convert(value) for value in expr])
# Container factory: pass through existing Containers/Cubes, run Queries, wrap iterables in a
# ListContainer, resolve strings against the default ES settings (index = first dotted
# segment), and dispatch typed/"from" mappings; anything else is an error.
# NOTE(review): the `type` parameter shadows the builtin and is unused in this body.
def new_instance(type, frum, schema=None): """ Factory! """ if not type2container: _delayed_imports() if isinstance(frum, Container): return frum elif isinstance(frum, _Cube): return frum elif isinstance(frum, _Query): return _run(frum) elif isinstance(frum, (list, set, GeneratorType)): return _ListContainer(frum) elif isinstance(frum, basestring): # USE DEFAULT STORAGE TO FIND Container if not config.default.settings: Log.error( "expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info" ) settings = set_default( { "index": join_field(split_field(frum)[:1:]), "name": frum, }, config.default.settings) settings.type = None # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY return type2container["elasticsearch"](settings) elif isinstance(frum, Mapping): frum = wrap(frum) if frum.type and type2container[frum.type]: return type2container[frum.type](frum.settings) elif frum["from"]: frum = copy(frum) frum["from"] = Container(frum["from"]) return _Query.wrap(frum) else: Log.error("Do not know how to handle {{frum|json}}", frum=frum) else: Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
# Factory resolving `frum` to a Container: identity for Containers/Cubes, execute Queries,
# ListContainer for iterables, default-ES settings for strings, and type2container /
# QueryOp.wrap for mappings; unhandled inputs raise via Log.error.
# NOTE(review): `type` shadows the builtin and is never read here.
def new_instance(type, frum, schema=None): """ Factory! """ if not type2container: _delayed_imports() if isinstance(frum, Container): return frum elif isinstance(frum, _Cube): return frum elif isinstance(frum, _Query): return _run(frum) elif isinstance(frum, (list, set, GeneratorType)): return _ListContainer(frum) elif isinstance(frum, basestring): # USE DEFAULT STORAGE TO FIND Container if not config.default.settings: Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info") settings = set_default( { "index": join_field(split_field(frum)[:1:]), "name": frum, }, config.default.settings ) settings.type = None # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY return type2container["elasticsearch"](settings) elif isinstance(frum, Mapping): frum = wrap(frum) if frum.type and type2container[frum.type]: return type2container[frum.type](frum.settings) elif frum["from"]: frum = copy(frum) frum["from"] = Container(frum["from"]) return _Query.wrap(frum) else: Log.error("Do not know how to handle {{frum|json}}", frum=frum) else: Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
# Compile a user expression into executable script text: substitute constants, prepend the
# frame-variable declarations (getFrameVariables), and wrap everything in a generated
# function (named by UID()) invoked with _source; expressions needing no context are
# returned with only the helper-function header prepended.
def compile_expression(self, expression, constants=None): # EXPAND EXPRESSION WITH ANY CONSTANTS expression = setValues(expression, constants) fromPath = self.fromData.name # FIRST NAME IS THE INDEX indexName = join_field(split_field(fromPath)[:1:]) context = self.getFrameVariables(expression) if context == "": return addFunctions(expression).head+expression func = UID() code = addFunctions(context+expression) output = code.head + \ 'var ' + func + ' = function(' + indexName + '){\n' + \ context + \ expression + ";\n" + \ '};\n' + \ func + '(_source)\n' return Compiled(output)
# Insert `docs`: backfill any missing UID components with unique names, flatten each doc into
# per-nested-path row collections, then emit one INSERT ... SELECT-UNION-ALL statement per
# nested path. Values are escaped via quote_value/quote_table (parameterized SQL is not used
# here — presumably safe because quoting helpers escape; verify for untrusted input).
def insert(self, docs): doc_collection = {} for d in docs: # ASSIGN A NON-NULL PRIMARY KEY if any(v == None for v in self.uid_accessor(d)): for u in self.uid: d[u] = coalesce(d[u], unique_name()) uid = wrap({u: d[u] for u in self.uid}) self.flatten(d, uid, doc_collection) for nested_path, insertion in doc_collection.items(): active_columns = list(insertion.active_columns) vals = [[quote_value(get_document_value(d, c)) for c in active_columns] for d in insertion.rows] command = "INSERT INTO " + quote_table(join_field(split_field(self.name)+split_field(nested_path[0]))) + "(" + \ ",".join(_quote_column(c) for c in active_columns) + \ ")\n" + \ " UNION ALL\n".join("SELECT " + ",".join(vv) for vv in vals) self.db.execute(command)
# Build the executable script for `expression`: apply constant substitution, gather the
# required frame-variable context, and emit a UID-named wrapper function called with
# _source. With empty context only the addFunctions header is prepended.
def compile_expression(self, expression, constants=None): # EXPAND EXPRESSION WITH ANY CONSTANTS expression = setValues(expression, constants) fromPath = self.fromData.name # FIRST NAME IS THE INDEX indexName = join_field(split_field(fromPath)[:1:]) context = self.getFrameVariables(expression) if context == "": return addFunctions(expression).head + expression func = UID() code = addFunctions(context + expression) output = code.head + \ 'var ' + func + ' = function(' + indexName + '){\n' + \ context + \ expression + ";\n" + \ '};\n' + \ func + '(_source)\n' return Compiled(output)
# Persist `docs` into the SQLite-backed table: ensure every UID field is non-null (filling
# with unique_name()), flatten docs per nested path, and run one bulk
# INSERT ... SELECT/UNION ALL per path with quote_value/quote_table escaping.
def insert(self, docs): doc_collection = {} for d in docs: # ASSIGN A NON-NULL PRIMARY KEY if any(v == None for v in self.uid_accessor(d)): for u in self.uid: d[u] = coalesce(d[u], unique_name()) uid = wrap({u: d[u] for u in self.uid}) self.flatten(d, uid, doc_collection) for nested_path, insertion in doc_collection.items(): active_columns = list(insertion.active_columns) vals = [[ quote_value(get_document_value(d, c)) for c in active_columns ] for d in insertion.rows] command = "INSERT INTO " + quote_table(join_field(split_field(self.name)+split_field(nested_path[0]))) + "(" + \ ",".join(_quote_column(c) for c in active_columns) + \ ")\n" + \ " UNION ALL\n".join("SELECT " + ",".join(vv) for vv in vals) self.db.execute(command)
# Add `column` to the schema if absent (sets keyed by name, deduped by type); a "nested"
# column creates and registers a child Table_usingSQLite with an extra per-level UID column,
# otherwise the physical column is added with ALTER TABLE.
def add_column(self, column): """ ADD COLUMN, IF IT DOES NOT EXIST ALREADY """ if column.name not in self.columns: self.columns[column.name] = {column} elif column.type not in [c.type for c in self.columns[column.name]]: self.columns[column.name].add(column) if column.type == "nested": nested_table_name = join_field( split_field(self.name) + split_field(column.name)) # MAKE THE TABLE table = Table_usingSQLite( nested_table_name, self.db, self.uid + [UID_PREFIX + "id" + unicode(len(self.uid))], exists=False) self.nested_tables[nested_table_name] = table else: self.db.execute("ALTER TABLE " + quote_table(self.name) + " ADD COLUMN " + _quote_column(column) + " " + column.type)
# Fetch (and lazily refresh) column metadata for `table_name`: new tables are registered and
# scanned; stale tables (older than MAX_COLUMN_METADATA_AGE) or force=True trigger a rescan.
# Polls once per second while the todo queue still owes last_updated stamps to the returned
# columns. Any failure is wrapped and re-raised through Log.error.
def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ try: # LAST TIME WE GOT INFO FOR THIS TABLE short_name = join_field(split_field(table_name)[0:1]) table = self.get_table(short_name)[0] if not table: table = Table(name=short_name, url=None, query_path=None, timestamp=Date.now()) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=short_name) elif force or table.timestamp == None or table.timestamp < Date.now( ) - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=short_name) with self.meta.columns.locker: columns = self.meta.columns.find(table_name, column_name) if columns: columns = jx.sort(columns, "name") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): Log.note("waiting for columns to update {{columns|json}}", columns=[ c.table + "." + c.es_column for c in columns if not c.last_updated ]) Thread.sleep(seconds=1) return columns except Exception, e: Log.error("Not expected", cause=e)
# Append ".$value" to every variable in `expr` for typed storage addressing: Expressions are
# renamed via their vars() map; scalars/"."/Dates pass through; keywords are suffixed;
# QueryOps and mappings are converted recursively; other bare strings are rejected.
def convert(self, expr): """ ADD THE ".$value" SUFFIX TO ALL VARIABLES """ if isinstance(expr, Expression): vars_ = expr.vars() rename = {v: join_field(split_field(v)+["$value"]) for v in vars_} return expr.map(rename) if expr is True or expr == None or expr is False: return expr elif Math.is_number(expr): return expr elif expr == ".": return "." elif is_keyword(expr): #TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX return expr + ".$value" elif isinstance(expr, basestring): Log.error("{{name|quote}} is not a valid variable name", name=expr) elif isinstance(expr, Date): return expr elif isinstance(expr, QueryOp): return self._convert_query(expr) elif isinstance(expr, Mapping): if expr["from"]: return self._convert_query(expr) elif len(expr) >= 2: #ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION return wrap({name: self.convert(value) for name, value in expr.items()}) else: # ASSUME SINGLE-CLAUSE EXPRESSION k, v = expr.items()[0] return self.converter_map.get(k, self._convert_bop)(k, v) elif isinstance(expr, (list, set, tuple)): return wrap([self.convert(value) for value in expr])
def typed_column(name, type):
    """Return `name` with a trailing "$<type>" segment appended to its dotted path."""
    segments = list(split_field(name))
    segments.append("$" + type)
    return join_field(segments)
def update(self, command):
    """
    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT

    APPLY A SET/CLEAR UPDATE TO THIS TABLE. NESTED (ARRAY) VALUES ARE
    REPLACED WHOLESALE: MATCHING CHILD ROWS ARE DELETED, THEN RE-INSERTED.
    FINALLY THE SHALLOW COLUMNS ARE UPDATED WITH ONE SQL UPDATE.
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    touched_columns = command.set.keys() | set(listwrap(command["clear"]))
    for c in self.get_leaves():
        if c.name in touched_columns and c.nested_path and len(c.name) > len(c.nested_path[0]):
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    # MAP QUERY VARIABLES TO THEIR (NON-STRUCTURAL) ES COLUMNS
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.type not in ["nested", "object"]
    }
    where_sql = where.map(_map).to_sql()
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(
            name=new_column_name,
            type=ctype,
            table=self.name,
            es_index=self.name,
            es_column=typed_column(new_column_name, ctype)
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            nested_table_name = join_field(split_field(self.name) + split_field(nested_column_name))
            nested_table = self.nested_tables[nested_table_name]
            self_primary_key = ",".join(quote_table(c.es_column) for u in self.uid for c in self.columns[u])
            extra_key_name = UID_PREFIX + "id" + unicode(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # DELETE OLD CHILD ROWS OF EVERY PARENT MATCHED BY where
            sql_command = "DELETE FROM " + quote_table(nested_table.name) + \
                          "\nWHERE EXISTS (" + \
                          "\nSELECT 1 " + \
                          "\nFROM " + quote_table(nested_table.name) + " n" + \
                          "\nJOIN (" + \
                          "\nSELECT " + self_primary_key + \
                          "\nFROM " + quote_table(self.name) + \
                          "\nWHERE " + where_sql + \
                          "\n) t ON " + \
                          " AND ".join(
                              "t." + quote_table(c.es_column) + " = n." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + \
                          ")"
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Dict(), doc_collection, path=nested_column_name)

            prefix = "INSERT INTO " + quote_table(nested_table.name) + \
                     "(" + \
                     self_primary_key + "," + \
                     _quote_column(extra_key) + "," + \
                     ",".join(
                         quote_table(c.es_column)
                         for c in doc_collection.get(".", Null).active_columns
                     ) + ")"

            # BUILD THE PARENT TABLES
            parent = "\nSELECT " + \
                     self_primary_key + \
                     "\nFROM " + quote_table(self.name) + \
                     "\nWHERE " + jx_expression(command.where).to_sql()

            # BUILD THE RECORDS
            children = " UNION ALL ".join(
                "\nSELECT " + quote_value(i) + " " + quote_table(extra_key.es_column) + "," +
                ",".join(
                    quote_value(row[c.name]) + " " + quote_table(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                )
                for i, row in enumerate(doc_collection.get(".", Null).rows)
            )

            # CROSS-JOIN PARENTS WITH THE NEW CHILD ROWS
            sql_command = prefix + \
                          "\nSELECT " + \
                          ",".join(
                              "p." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + "," + \
                          "c." + _quote_column(extra_key) + "," + \
                          ",".join(
                              "c." + quote_table(c.es_column)
                              for c in doc_collection.get(".", Null).active_columns
                          ) + \
                          "\nFROM (" + parent + ") p " + \
                          "\nJOIN (" + children + \
                          "\n) c on 1=1"

            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        name=c.name,
                        type=c.type,
                        table=self.name,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + listwrap(c.nested_path)
                    )
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.type not in [c.type for c in self.columns[c.name]]:
                        self.columns[column.name].add(column)

    # FINALLY, UPDATE THE SHALLOW (NON-NESTED) COLUMNS IN ONE STATEMENT
    command = "UPDATE " + quote_table(self.name) + " SET " + \
              ",\n".join(
                  [
                      _quote_column(c) + "=" + quote_value(get_if_type(v, c.type))
                      for k, v in command.set.items()
                      if get_type(v) != "nested"
                      for c in self.columns[k]
                      if c.type != "nested" and not c.nested_path
                  ] +
                  [
                      _quote_column(c) + "=NULL"
                      for k in listwrap(command["clear"])
                      if k in self.columns
                      for c in self.columns[k]
                      if c.type != "nested" and not c.nested_path
                  ]
              ) + \
              " WHERE " + where_sql

    self.db.execute(command)
def _flatten(d, uid, path, nested_path, row=None):
    """
    RECURSIVELY FLATTEN DOCUMENT d INTO doc_collection (CLOSURE), REGISTERING
    COLUMNS ON self AS THEY ARE DISCOVERED.

    :param d: DOCUMENT (Mapping) OR SCALAR TO FLATTEN
    :param uid: dict OF UID KEY/VALUES IDENTIFYING THE PARENT ROW
    :param path: DOTTED PATH PREFIX FOR COLUMN NAMES
    :param nested_path: LIST OF EVER-DEEPER NESTED PATHS (INNERMOST FIRST)
    :param row: EXISTING ROW TO FILL (USED WHEN DESCENDING INTO AN INNER
                OBJECT); None MEANS START A NEW ROW
    """
    insertion = doc_collection["." if not nested_path else nested_path[0]]
    if row is None:
        row = uid.copy()
        insertion.rows.append(row)
    if isinstance(d, Mapping):
        for k, v in d.items():
            cname = join_field(split_field(path) + [k])
            ctype = get_type(v)
            if ctype is None:
                continue  # IGNORE VALUES WITH NO USABLE TYPE

            # FIND (OR CREATE) THE TYPED COLUMN FOR THIS NAME/TYPE PAIR
            c = unwraplist([c for c in self.columns.get(cname, Null) if c.type == ctype])
            if not c:
                c = Column(
                    name=cname,
                    table=self.name,
                    type=ctype,
                    es_column=typed_column(cname, ctype),
                    es_index=self.name,
                    nested_path=nested_path
                )
                self.add_column(c)
            insertion.active_columns.add(c)

            if ctype == "nested":
                row[cname] = "."
                deeper = [cname] + listwrap(nested_path)
                insertion = doc_collection.get(cname, None)
                if not insertion:
                    doc_collection[cname] = Dict(active_columns=set(), rows=[])
                # NOTE(review): insertion MAY STILL BE None HERE; LATER
                # ITERATIONS RELY ON THE RECURSIVE CALL RE-FETCHING IT
                for i, r in enumerate(v):
                    child_uid = set_default({UID_PREFIX + "id" + unicode(len(uid)): i}, uid)
                    _flatten(r, child_uid, cname, deeper)
            elif ctype == "object":
                row[cname] = "."
                # BUG FIX: ORIGINAL CALLED _flatten(v, cname, nested_path) WITH A
                # MISSING ARGUMENT (TypeError); DESCEND WITH SAME uid AND SAME row
                # SO THE OBJECT'S LEAVES LAND IN THIS ROW
                _flatten(v, uid, cname, nested_path, row=row)
            elif c.type:
                row[cname] = v
    else:
        # SCALAR DOCUMENT: STORE UNDER THE "." LEAF OF path
        k = "."
        v = d
        cname = join_field(split_field(path) + [k])
        ctype = get_type(v)
        if ctype is None:
            return

        c = unwraplist([c for c in self.columns.get(cname, Null) if c.type == ctype])
        if not c:
            c = Column(
                name=cname,
                table=self.name,
                type=ctype,
                es_column=typed_column(cname, ctype),
                es_index=self.name,
                nested_path=nested_path
            )
            self.add_column(c)
        insertion.active_columns.add(c)

        if ctype == "nested":
            row[cname] = "."
            deeper = [cname] + listwrap(nested_path)
            insertion = doc_collection.get(cname, None)
            if not insertion:
                doc_collection[cname] = Dict(active_columns=set(), rows=[])
            for i, r in enumerate(v):
                child_uid = set_default({UID_PREFIX + "id" + unicode(len(uid)): i}, uid)
                _flatten(r, child_uid, cname, deeper)
        elif ctype == "object":
            row[cname] = "."
            # BUG FIX: SAME MISSING-ARGUMENT DEFECT AS ABOVE
            _flatten(v, uid, cname, nested_path, row=row)
        elif c.type:
            row[cname] = v
def extract_rows(es, es_query, query):
    """
    TRANSLATE query.select CLAUSES INTO ES fields/script_fields/_source
    REQUESTS, POST THE QUERY, AND FORMAT THE HITS PER query.format.
    """
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    # LEAF COLUMNS: CONCRETE VALUES (NOT STRUCTS), NOT BURIED IN A NESTED PATH
    leaf_columns = set(c.name for c in columns if c.type not in STRUCT and (not c.nested_path or c.es_column == c.nested_path))
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"  # SWITCHES TO "_source" WHEN FULL DOCUMENTS ARE NEEDED
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            term = s.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    # SELECT ALL LEAVES: MUST PULL FULL _source
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": Variable(n),
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    # SELECT LEAVES UNDER A PREFIX
                    parent = term.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": Variable(c),
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                # WHOLE DOCUMENT
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                # _id IS PULLED FROM HIT METADATA, NOT FIELDS
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                # NESTED VALUES ONLY AVAILABLE VIA _source
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": Variable(n),
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            # GENERAL EXPRESSION: EVALUATE SERVER-SIDE AS A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # ASSIGN pull PATHS FOR EVERYTHING NOT ALREADY PINNED
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value.var))
        elif isinstance(n.value, Variable):
            n.pull = "fields." + literal_field(n.value.var)
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def update(self, command):
    """
    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT

    APPLY A SET/CLEAR UPDATE TO THIS TABLE. NESTED (ARRAY) VALUES ARE
    REPLACED WHOLESALE: MATCHING CHILD ROWS ARE DELETED, THEN RE-INSERTED.
    FINALLY THE SHALLOW COLUMNS ARE UPDATED WITH ONE SQL UPDATE.
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    touched_columns = command.set.keys() | set(listwrap(command["clear"]))
    for c in self.get_leaves():
        if c.name in touched_columns and c.nested_path and len(c.name) > len(c.nested_path[0]):
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    # MAP QUERY VARIABLES TO THEIR (NON-STRUCTURAL) ES COLUMNS
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.type not in ["nested", "object"]
    }
    where_sql = where.map(_map).to_sql()
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(
            name=new_column_name,
            type=ctype,
            table=self.name,
            es_index=self.name,
            es_column=typed_column(new_column_name, ctype)
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            nested_table_name = join_field(split_field(self.name) + split_field(nested_column_name))
            nested_table = self.nested_tables[nested_table_name]
            self_primary_key = ",".join(quote_table(c.es_column) for u in self.uid for c in self.columns[u])
            extra_key_name = UID_PREFIX + "id" + unicode(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # DELETE OLD CHILD ROWS OF EVERY PARENT MATCHED BY where
            sql_command = "DELETE FROM " + quote_table(nested_table.name) + \
                          "\nWHERE EXISTS (" + \
                          "\nSELECT 1 " + \
                          "\nFROM " + quote_table(nested_table.name) + " n" + \
                          "\nJOIN (" + \
                          "\nSELECT " + self_primary_key + \
                          "\nFROM " + quote_table(self.name) + \
                          "\nWHERE " + where_sql + \
                          "\n) t ON " + \
                          " AND ".join(
                              "t." + quote_table(c.es_column) + " = n." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + \
                          ")"
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Dict(), doc_collection, path=nested_column_name)

            prefix = "INSERT INTO " + quote_table(nested_table.name) + \
                     "(" + \
                     self_primary_key + "," + \
                     _quote_column(extra_key) + "," + \
                     ",".join(
                         quote_table(c.es_column)
                         for c in doc_collection.get(".", Null).active_columns
                     ) + ")"

            # BUILD THE PARENT TABLES
            parent = "\nSELECT " + \
                     self_primary_key + \
                     "\nFROM " + quote_table(self.name) + \
                     "\nWHERE " + jx_expression(command.where).to_sql()

            # BUILD THE RECORDS
            children = " UNION ALL ".join(
                "\nSELECT " + quote_value(i) + " " + quote_table(extra_key.es_column) + "," +
                ",".join(
                    quote_value(row[c.name]) + " " + quote_table(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                )
                for i, row in enumerate(doc_collection.get(".", Null).rows)
            )

            # CROSS-JOIN PARENTS WITH THE NEW CHILD ROWS
            sql_command = prefix + \
                          "\nSELECT " + \
                          ",".join(
                              "p." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + "," + \
                          "c." + _quote_column(extra_key) + "," + \
                          ",".join(
                              "c." + quote_table(c.es_column)
                              for c in doc_collection.get(".", Null).active_columns
                          ) + \
                          "\nFROM (" + parent + ") p " + \
                          "\nJOIN (" + children + \
                          "\n) c on 1=1"

            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        name=c.name,
                        type=c.type,
                        table=self.name,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + listwrap(c.nested_path)
                    )
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.type not in [c.type for c in self.columns[c.name]]:
                        self.columns[column.name].add(column)

    # FINALLY, UPDATE THE SHALLOW (NON-NESTED) COLUMNS IN ONE STATEMENT
    command = "UPDATE " + quote_table(self.name) + " SET " + \
              ",\n".join(
                  [
                      _quote_column(c) + "=" + quote_value(get_if_type(v, c.type))
                      for k, v in command.set.items()
                      if get_type(v) != "nested"
                      for c in self.columns[k]
                      if c.type != "nested" and not c.nested_path
                  ] +
                  [
                      _quote_column(c) + "=NULL"
                      for k in listwrap(command["clear"])
                      if k in self.columns
                      for c in self.columns[k]
                      if c.type != "nested" and not c.nested_path
                  ]
              ) + \
              " WHERE " + where_sql

    self.db.execute(command)
def parse_properties(parent_index_name, parent_query_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    :param parent_index_name: NAME OF THE ES INDEX BEING PARSED
    :param parent_query_path: DOTTED PATH TO THIS MAPPING LEVEL (None AT ROOT)
    :param esProperties: THE "properties" OBJECT FROM AN ES MAPPING
    :return: DictList OF Column DEFINITIONS (RECURSES INTO SUB-PROPERTIES)
    """
    from pyLibrary.queries.meta import Column

    columns = DictList()
    for name, property in esProperties.items():
        # EXTEND THE QUERY PATH WITH THIS PROPERTY'S NAME
        if parent_query_path:
            index_name, query_path = parent_index_name, join_field(split_field(parent_query_path) + [name])
        else:
            index_name, query_path = parent_index_name, name

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, query_path, property.properties)
            for c in self_columns:
                c.nested_path = unwraplist([query_path] + listwrap(c.nested_path))
            columns.extend(self_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="nested",
                nested_path=query_path
            ))
            continue

        if property.properties:
            # PLAIN OBJECT: RECURSE, THEN RECORD THE OBJECT COLUMN ITSELF
            child_columns = parse_properties(index_name, query_path, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="source" if property.enabled == False else "object"
            ))

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=query_path,
                        es_column=query_path,
                        type=p.type
                    ))
                else:
                    # SECONDARY FIELDS GET AN ESCAPED-DOT SUFFIX
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=query_path + "\\." + n,
                        es_column=query_path + "\\." + n,
                        type=p.type
                    ))
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                # NOTE(review): THIS DUPLICATE APPEARS IDENTICAL TO THE ONE ABOVE
                columns.append(Column(
                    table=index_name,
                    es_index=index_name,
                    es_column=query_path,
                    name=query_path,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="source" if property.enabled == False else "object"
            ))
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)

    return columns
def getDomain(self, **kwargs):
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    # BUILD A partitions LIST FOR THIS DIMENSION AT THE REQUESTED DEPTH.
    # NOTE(review): THE TAIL OF THIS FUNCTION (WHAT IT RETURNS) IS NOT
    # VISIBLE IN THIS CHUNK - CONFIRM AGAINST THE FULL FILE.
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields) - 1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "esfilter": v.esfilter,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.esfilter
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = DictList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "esfilter": part.esfilter,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        # TOP-LEVEL PARTITIONS ONLY
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "esfilter": v.esfilter,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL OF SUB-PARTITIONS, QUALIFYING NAMES WITH PARENT
        partitions = DictList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "esfilter": subpart.esfilter,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
def typed_column(name, type):
    """
    RETURN THE TYPED COLUMN NAME: name WITH A "$<type>" LEAF APPENDED
    """
    leaf = "$" + type
    return join_field(split_field(name) + [leaf])
def untyped_column(column_name):
    """
    STRIP THE TRAILING "$<type>" LEAF FROM A TYPED COLUMN NAME; RETURN
    UNTYPED NAMES UNCHANGED.
    """
    if "$" not in column_name:
        return column_name
    return join_field(split_field(column_name)[:-1])
def extract_rows(es, es_query, query):
    """
    TRANSLATE STRING-BASED query.select CLAUSES (es09 STYLE) INTO ES
    fields/script_fields/_source REQUESTS, POST THE QUERY, AND FORMAT THE HITS.
    """
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    # CONCRETE (NON-OBJECT, NON-DEEP) COLUMN NAMES AVAILABLE FOR SELECTION
    column_names = set(c.name for c in query.frum.get_columns() if c.type not in ["object"] and (not c.nested_path or c.abs_name == c.nested_path or not c.nested_path))
    source = "fields"  # SWITCHES TO "_source" WHEN FULL DOCUMENTS ARE NEEDED

    i = 0
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            es_query.fields = None
            source = "_source"

            net_columns = column_names - set(select.name)
            for n in net_columns:
                new_select.append({
                    "name": n,
                    "value": n,
                    "put": {"name": n, "index": i, "child": "."}
                })
                i += 1
        elif s.value == ".":
            # WHOLE DOCUMENT
            es_query.fields = None
            source = "_source"

            new_select.append({
                "name": s.name,
                "value": s.value,
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif s.value == "_id":
            # _id IS PULLED FROM HIT METADATA, NOT FIELDS
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            # PREFIX WILDCARD: EXPAND TO ALL MATCHING COLUMNS
            parent = s.value[:-1]
            prefix = len(parent)
            for c in column_names:
                if c.startswith(parent):
                    if es_query.fields is not None:
                        es_query.fields.append(c)

                    new_select.append({
                        "name": s.name + "." + c[prefix:],
                        "value": c,
                        "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in column_names if c.startswith(parent)]
            if not net_columns:
                # SIMPLE LEAF COLUMN
                if es_query.fields is not None:
                    es_query.fields.append(s.value)
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                # OBJECT: SELECT EACH OF ITS LEAVES
                for n in net_columns:
                    if es_query.fields is not None:
                        es_query.fields.append(n)
                    new_select.append({
                        "name": s.name,
                        "value": n,
                        "put": {"name": s.name, "index": i, "child": n[prefix:]}
                    })
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")  # UNIMPLEMENTED: RAISES BEFORE THE CODE BELOW
            if es_query.fields is not None:
                es_query.fields.extend([v for v in s.value])
        else:
            # GENERAL EXPRESSION: EVALUATE SERVER-SIDE AS A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression(s.value).to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # ASSIGN pull PATHS FOR EVERYTHING NOT ALREADY PINNED
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    SINGLETON CONTAINER OVER THE CLUSTER'S TABLE/COLUMN METADATA; A
    BACKGROUND THREAD KEEPS COLUMN STATISTICS FRESH VIA self.todo.
    """

    def __new__(cls, *args, **kwargs):
        # ENFORCE SINGLETON (MODULE-LEVEL singlton)
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return  # SINGLETON ALREADY INITIALIZED

        # LOCAL IMPORTS TO AVOID CIRCULAR MODULE DEPENDENCIES
        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        # COLUMNS WAITING FOR STATISTICS REFRESH
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns}))
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        # RETURN TABLE RECORDS MATCHING table_name
        with self.meta.tables.locker:
            return self.meta.tables.query({"where": {"eq": {"name": table_name}}})

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = [r for r in self.meta.columns.data if r.table == c.table and r.name == c.name]
        if not existing_columns:
            self.meta.columns.add(c)
            Log.note("todo: {{table}}.{{column}}", table=c.table, column=c.es_column)
            self.todo.add(c)

            # MARK meta.columns AS DIRTY TOO
            cols = [r for r in self.meta.columns.data if r.table == "meta.columns"]
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            # OVERWRITE THE CANONICAL RECORD IN PLACE
            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}.{{column}}", table=canonical.table, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None, metadata=None):
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            # SCAN EVERY (CANONICAL) INDEX IN THE CLUSTER
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if meta.index != abs_index:
                    continue  # SKIP ALIAS ENTRIES
                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    # table NAMED AN ALIAS; RETRY WITH THE CANONICAL INDEX
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug")
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                # UPSERT c UNDER ITS INDEX NAME, AND AGAIN UNDER EVERY ALIAS
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)

            # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
            query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"])
            for a, b in itertools.product(query_paths, query_paths):
                aa = a.last()
                bb = b.last()
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) < len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(0, aa)
                        break
            query_paths.append([])

            for c in abs_columns:
                # ADD RELATIVE COLUMNS
                full_path = listwrap(c.nested_path)
                abs_depth = len(full_path)
                abs_parent = coalesce(full_path.last(), "")
                for query_path in query_paths:
                    rel_depth = len(query_path)

                    # ABSOLUTE
                    add_column(copy(c), query_path)
                    cc = copy(c)
                    cc.relative = True

                    if not query_path:
                        add_column(cc, query_path)
                        continue

                    rel_parent = query_path.last()

                    if c.es_column.startswith(rel_parent + "."):
                        cc.name = c.es_column[len(rel_parent) + 1:]
                        add_column(cc, query_path)
                    elif c.es_column == rel_parent:
                        cc.name = "."
                        add_column(cc, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )))

    def get_columns(self, table_name, column_name=None, fail_when_not_found=False):
        """
        RETURN METADATA COLUMNS

        :param fail_when_not_found: True TO WAIT FOR UPDATES AND RAISE IF
               COLUMNS NEVER APPEAR (PREVENTS INFINITE RECURSION ON RETRY)
        """
        try:
            with self.meta.columns.locker:
                columns = [c for c in self.meta.columns.data if c.table == table_name and (column_name is None or c.name == column_name)]
            if columns:
                columns = jx.sort(columns, "name")
                if fail_when_not_found:
                    # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                    while len(self.todo) and not all(columns.get("last_updated")):
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.table + "." + c.es_column for c in columns if not c.last_updated])
                        Thread.sleep(seconds=1)
                    return columns
                elif all(columns.get("last_updated")):
                    return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if fail_when_not_found:
            if column_name:
                Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name)
            else:
                self._get_columns(table=table_name)
                Log.error("no columns for {{table}}", table=table_name)

        # REFRESH METADATA FOR THE SHORT TABLE NAME, THEN RETRY ONCE (STRICT)
        self._get_columns(table=join_field(split_field(table_name)[0:1]))
        return self.get_columns(table_name=table_name, column_name=column_name, fail_when_not_found=True)
def _flatten(d, uid, path, nested_path, row=None):
    """
    RECURSIVELY FLATTEN DOCUMENT d INTO doc_collection (CLOSURE), REGISTERING
    COLUMNS ON self AS THEY ARE DISCOVERED.

    :param d: DOCUMENT (Mapping) OR SCALAR TO FLATTEN
    :param uid: dict OF UID KEY/VALUES IDENTIFYING THE PARENT ROW
    :param path: DOTTED PATH PREFIX FOR COLUMN NAMES
    :param nested_path: LIST OF EVER-DEEPER NESTED PATHS (INNERMOST FIRST)
    :param row: EXISTING ROW TO FILL (USED WHEN DESCENDING INTO AN INNER
                OBJECT); None MEANS START A NEW ROW
    """
    insertion = doc_collection["." if not nested_path else nested_path[0]]
    if row is None:
        row = uid.copy()
        insertion.rows.append(row)
    if isinstance(d, Mapping):
        for k, v in d.items():
            cname = join_field(split_field(path) + [k])
            ctype = get_type(v)
            if ctype is None:
                continue  # IGNORE VALUES WITH NO USABLE TYPE

            # FIND (OR CREATE) THE TYPED COLUMN FOR THIS NAME/TYPE PAIR
            c = unwraplist([c for c in self.columns.get(cname, Null) if c.type == ctype])
            if not c:
                c = Column(
                    name=cname,
                    table=self.name,
                    type=ctype,
                    es_column=typed_column(cname, ctype),
                    es_index=self.name,
                    nested_path=nested_path
                )
                self.add_column(c)
            insertion.active_columns.add(c)

            if ctype == "nested":
                row[cname] = "."
                deeper = [cname] + listwrap(nested_path)
                insertion = doc_collection.get(cname, None)
                if not insertion:
                    doc_collection[cname] = Dict(active_columns=set(), rows=[])
                # NOTE(review): insertion MAY STILL BE None HERE; LATER
                # ITERATIONS RELY ON THE RECURSIVE CALL RE-FETCHING IT
                for i, r in enumerate(v):
                    child_uid = set_default({UID_PREFIX + "id" + unicode(len(uid)): i}, uid)
                    _flatten(r, child_uid, cname, deeper)
            elif ctype == "object":
                row[cname] = "."
                # BUG FIX: ORIGINAL CALLED _flatten(v, cname, nested_path) WITH A
                # MISSING ARGUMENT (TypeError); DESCEND WITH SAME uid AND SAME row
                # SO THE OBJECT'S LEAVES LAND IN THIS ROW
                _flatten(v, uid, cname, nested_path, row=row)
            elif c.type:
                row[cname] = v
    else:
        # SCALAR DOCUMENT: STORE UNDER THE "." LEAF OF path
        k = "."
        v = d
        cname = join_field(split_field(path) + [k])
        ctype = get_type(v)
        if ctype is None:
            return

        c = unwraplist([c for c in self.columns.get(cname, Null) if c.type == ctype])
        if not c:
            c = Column(
                name=cname,
                table=self.name,
                type=ctype,
                es_column=typed_column(cname, ctype),
                es_index=self.name,
                nested_path=nested_path
            )
            self.add_column(c)
        insertion.active_columns.add(c)

        if ctype == "nested":
            row[cname] = "."
            deeper = [cname] + listwrap(nested_path)
            insertion = doc_collection.get(cname, None)
            if not insertion:
                doc_collection[cname] = Dict(active_columns=set(), rows=[])
            for i, r in enumerate(v):
                child_uid = set_default({UID_PREFIX + "id" + unicode(len(uid)): i}, uid)
                _flatten(r, child_uid, cname, deeper)
        elif ctype == "object":
            row[cname] = "."
            # BUG FIX: SAME MISSING-ARGUMENT DEFECT AS ABOVE
            _flatten(v, uid, cname, nested_path, row=row)
        elif c.type:
            row[cname] = v
def __init__(self, dim, parent, qb):
    """
    BUILD A Dimension FROM ITS DEFINITION dim, OPTIONALLY NESTED UNDER parent.
    MAY ISSUE A QUERY THROUGH qb TO DISCOVER PARTITIONS WHEN NONE ARE GIVEN.
    """
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    self.edges = None  # FOR NOW
    dot.set_default(self, dim)
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Dict()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, qb)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        self.fields = wrap(fields)
        edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
    else:
        self.fields = listwrap(fields)
        edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    # DISCOVER PARTITIONS BY COUNTING OVER THE EDGES
    qb.get_columns()
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = qb.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Dict(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(
                temp,
                dim.path(d.getEnd(d.partitions[i])),
                count,
                0
            )
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = DictList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # RETURN EITHER A NAMED STRUCTURE OR A TUPLE, MATCHING fields' SHAPE
            if isinstance(fields, Mapping):
                output = Dict()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def es_aggsop(es, frum, query): select = listwrap(query.select) es_query = Dict() new_select = Dict() formula = [] for s in select: if s.aggregate == "count" and (s.value == None or s.value == "."): s.pull = "doc_count" elif is_keyword(s.value): new_select[literal_field(s.value)] += [s] else: formula.append(s) for litral_field, many in new_select.items(): if len(many)>1: canonical_name=literal_field(many[0].name) es_query.aggs[canonical_name].stats.field = many[0].value for s in many: if s.aggregate == "count": s.pull = canonical_name + ".count" else: s.pull = canonical_name + "." + aggregates1_4[s.aggregate] else: s = many[0] s.pull = literal_field(s.value) + ".value" es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value for i, s in enumerate(formula): new_select[unicode(i)] = s s.pull = literal_field(s.name) + ".value" es_query.aggs[literal_field(s.name)][aggregates1_4[s.aggregate]].script = qb_expression_to_ruby(s.value) decoders = [AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, [])] start = 0 for d in decoders: es_query = d.append_query(es_query, start) start += d.num_columns if query.where: filter = simplify_esfilter(query.where) es_query = Dict( aggs={"_filter": set_default({"filter": filter}, es_query)} ) if len(split_field(frum.name)) > 1: es_query = wrap({ "size": 0, "aggs": {"_nested": set_default({ "nested": { "path": join_field(split_field(frum.name)[1::]) } }, es_query)} }) with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.es_response_time = es_duration.seconds output.meta.content_type = 
mime_type output.meta.es_query = es_query return output except Exception, e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format= query.format, cause=e) Log.error("Some problem", e)
def query_path(self):
    """Return the path below the table name: every segment after the first."""
    segments = split_field(self.name)
    return join_field(segments[1:])
def es_aggsop(es, frum, query): select = listwrap(query.select) es_query = Dict() new_select = Dict() formula = [] for s in select: if s.aggregate == "count" and (s.value == None or s.value == "."): s.pull = "doc_count" elif is_keyword(s.value): new_select[literal_field(s.value)] += [s] else: formula.append(s) for litral_field, many in new_select.items(): if len(many) > 1: canonical_name = literal_field(many[0].name) es_query.aggs[canonical_name].stats.field = many[0].value for s in many: if s.aggregate == "count": s.pull = canonical_name + ".count" else: s.pull = canonical_name + "." + aggregates1_4[s.aggregate] else: s = many[0] s.pull = literal_field(s.value) + ".value" es_query.aggs[literal_field( s.value)][aggregates1_4[s.aggregate]].field = s.value for i, s in enumerate(formula): new_select[unicode(i)] = s s.pull = literal_field(s.name) + ".value" es_query.aggs[literal_field(s.name)][aggregates1_4[ s.aggregate]].script = qb_expression_to_ruby(s.value) decoders = [ AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, []) ] start = 0 for d in decoders: es_query = d.append_query(es_query, start) start += d.num_columns if query.where: filter = simplify_esfilter(query.where) es_query = Dict( aggs={"_filter": set_default({"filter": filter}, es_query)}) if len(split_field(frum.name)) > 1: es_query = wrap({ "size": 0, "aggs": { "_nested": set_default( { "nested": { "path": join_field(split_field(frum.name)[1::]) } }, es_query) } }) with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[ query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.es_response_time = es_duration.duration 
output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception, e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)
def __init__(self, dim, parent, qb):
    """
    Build a Dimension from the definition `dim`, optionally nested under
    `parent`, using `qb` to query the backing index for the actual partitions
    when they are not given explicitly.  This variant uses `esfilter` (raw ES
    filter clauses) where the sibling implementation uses `where`.
    """
    self.name = dim.name
    self.parent = parent
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    dot.set_default(self, dim)
    self.esfilter = dim.esfilter
    self.type = coalesce(dim.type, "set")  # default partition type
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.es.settings.name)
    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Dict()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, qb)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        # fields AS A MAPPING: name -> es field expression
        self.fields = wrap(fields)
        edges = wrap([{
            "name": k,
            "value": v,
            "allowNulls": False
        } for k, v in self.fields.items()])
    else:
        # fields AS A SIMPLE LIST OF FIELD NAMES
        self.fields = listwrap(fields)
        edges = wrap([{
            "name": f,
            "value": f,
            "index": i,
            "allowNulls": False
        } for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    # NOTE(review): tests raw `dim.type` (may be None) where the sibling
    # implementation tests the coalesced `self.type` — confirm intended
    if dim.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = qb.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "esfilter": self.esfilter,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Dict(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        partitions = DictList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "esfilter": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain
        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # MAP A TUPLE OF EDGE VALUES BACK TO THE SHAPE OF `fields`
            if isinstance(fields, Mapping):
                output = Dict()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "esfilter": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def extract_rows(es, es_query, query): is_list = isinstance(query.select, list) select = wrap([s.copy() for s in listwrap(query.select)]) new_select = DictList() columns = query.frum.get_columns() leaf_columns = set(c.name for c in columns if c.type not in ["object", "nested"] and (not c.nested_path or c.es_column == c.nested_path)) nested_columns = set(c.name for c in columns if c.nested_path) i = 0 source = "fields" for s in select: # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS if isinstance(s.value, LeavesOp): if isinstance(s.value.term, Variable): if s.value.term.var == ".": es_query.fields = None source = "_source" net_columns = leaf_columns - set(select.name) for n in net_columns: new_select.append({ "name": n, "value": n, "put": {"name": n, "index": i, "child": "."} }) i += 1 else: parent = s.value.var + "." prefix = len(parent) for c in leaf_columns: if c.startswith(parent): if es_query.fields is not None: es_query.fields.append(c) new_select.append({ "name": s.name + "." + c[prefix:], "value": c, "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."} }) i += 1 elif isinstance(s.value, Variable): if s.value.var == ".": es_query.fields = None source = "_source" new_select.append({ "name": s.name, "value": s.value.var, "put": {"name": s.name, "index": i, "child": "."} }) i += 1 elif s.value.var == "_id": new_select.append({ "name": s.name, "value": s.value.var, "pull": "_id", "put": {"name": s.name, "index": i, "child": "."} }) i += 1 elif s.value.var in nested_columns: es_query.fields = None source = "_source" new_select.append({ "name": s.name, "value": s.value, "put": {"name": s.name, "index": i, "child": "."} }) i += 1 else: parent = s.value.var + "." 
prefix = len(parent) net_columns = [c for c in leaf_columns if c.startswith(parent)] if not net_columns: # LEAF if es_query.fields is not None: es_query.fields.append(s.value.var) new_select.append({ "name": s.name, "value": s.value, "put": {"name": s.name, "index": i, "child": "."} }) else: # LEAVES OF OBJECT for n in net_columns: if es_query.fields is not None: es_query.fields.append(n) new_select.append({ "name": s.name, "value": n, "put": {"name": s.name, "index": i, "child": n[prefix:]} }) i += 1 else: es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()} new_select.append({ "name": s.name, "pull": "fields." + literal_field(s.name), "put": {"name": s.name, "index": i, "child": "."} }) i += 1 for n in new_select: if n.pull: continue if source == "_source": n.pull = join_field(["_source"] + split_field(n.value)) else: n.pull = "fields." + literal_field(n.value) with Timer("call to ES") as call_timer: data = es09.util.post(es, es_query, query.limit) T = data.hits.hits try: formatter, groupby_formatter, mime_type = format_dispatch[query.format] output = formatter(T, new_select, query) output.meta.timing.es = call_timer.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception, e: Log.error("problem formatting", e)
def parse_columns(parent_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    :param parent_path: dotted path of the enclosing property (may be empty)
    :param esProperties: the ES mapping `properties` object to walk
    :return: DictList of column definitions (name, type, useSource, ...)
    """
    columns = DictList()
    for name, prop in esProperties.items():  # renamed: `property` shadowed builtin
        if parent_path:
            path = join_field(split_field(parent_path) + [name])
        else:
            path = name

        if prop.type == "nested" and prop.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            child_columns = deepcopy(parse_columns(path, prop.properties))
            self_columns = deepcopy(child_columns)
            for c in self_columns:
                c.depth += 1
            columns.extend(self_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "nested",
                "useSource": False
            })

            if path not in INDEX_CACHE:
                # FIND THE NEAREST CACHED ANCESTOR AND CLONE ITS ENTRY
                pp = split_field(parent_path)
                for i in qb.reverse(range(len(pp))):
                    c = INDEX_CACHE.get(join_field(pp[:i + 1]), None)
                    if c:
                        INDEX_CACHE[path] = c.copy()
                        break
                else:
                    Log.error("Can not find parent")

                INDEX_CACHE[path].name = path
            INDEX_CACHE[path].columns = child_columns
            continue

        if prop.properties:
            child_columns = parse_columns(path, prop.properties)
            columns.extend(child_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": False
            })

        if prop.dynamic:
            continue
        if not prop.type:
            continue
        if prop.type == "multi_field":
            prop.type = prop.fields[name].type  # PULL DEFAULT TYPE
            # BUG FIX: was `for i, n, p in enumerate(prop.fields):` —
            # enumerate yields 2-tuples, so the 3-name unpacking raised;
            # iterate the (name, property) pairs directly
            for n, p in prop.fields.items():
                if n == name:
                    # DEFAULT
                    columns.append({
                        "name": join_field(split_field(path)[1::]),
                        "type": p.type,
                        "useSource": p.index == "no"
                    })
                else:
                    columns.append({
                        "name": join_field(split_field(path)[1::]) + "\\." + n,
                        "type": p.type,
                        "useSource": p.index == "no"
                    })
            continue

        if prop.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": prop.type,
                "useSource": prop.index == "no"
            })
            if prop.index_name and name != prop.index_name:
                columns.append({
                    "name": prop.index_name,
                    "type": prop.type,
                    "useSource": prop.index == "no"
                })
        elif prop.enabled == None or prop.enabled == False:
            # DISABLED PROPERTY: TREAT AS OPAQUE OBJECT, READ FROM _source
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": True
            })
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=prop.type, path=path)

    return columns