def compare_to_expected(query, result, expect, places):
    """
    ASSERT result MATCHES expect TO THE GIVEN NUMBER OF DECIMAL places.

    WHEN THE QUERY HAS NO EXPLICIT sort, ROW ORDER IS NOT GUARANTEED, SO BOTH
    SIDES ARE CANONICALIZED (COLUMNS ALIGNED, ROWS SORTED) BEFORE COMPARISON.

    :param query: THE QUERY THAT PRODUCED result (ONLY ITS sort/edges/groupby ARE READ)
    :param result: ACTUAL QUERY RESPONSE (format IS ONE OF "table", "list", "cube")
    :param expect: EXPECTED RESPONSE, SAME SHAPE AS result
    :param places: DECIMAL PRECISION FORWARDED TO assertAlmostEqual
    :raises Exception: VIA Log.error/assertAlmostEqual WHEN THE TWO DO NOT MATCH
    """
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        try:
            assertAlmostEqual(set(result.header), set(expect.header))
        except Exception as e:
            Log.error("format=table headers do not match", cause=e)

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = transpose(*transpose(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = transpose(*unwrap(result.data))
            result.data = transpose(*(columns[m] for m in mapping))

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)),
                    "name"
                )
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            # BEST EFFORT: SOME DATA CAN NOT BE SORTED BY THESE COLUMNS
            if is_list(expect.data):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if is_list(result.data):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass
    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        # BUG FIX: map() IS A ONE-SHOT ITERATOR ON PYTHON 3; THE HEADER IS USED
        # TWICE BELOW (jx.sort AND list2cube), SO IT MUST BE MATERIALIZED AS A LIST
        result_header = list(map(literal_field, result_header))
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_header = list(map(literal_field, expect_header))  # SAME FIX AS ABOVE
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)
def compare_to_expected(query, result, expect):
    """
    ASSERT result MATCHES expect (FIXED 6 DECIMAL PLACES).

    OLDER VARIANT: NORMALIZES COLUMN ORDER AND, WHEN THE QUERY HAS NO sort,
    ROW ORDER BEFORE COMPARING.
    """
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = list(zip(*list(zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        )))[1]))[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = list(zip(*unwrap(result.data)))
            # NOTE(review): zip() RETURNS A ONE-SHOT ITERATOR ON PYTHON 3 —
            # presumably downstream assertAlmostEqual iterates it exactly once; verify
            result.data = zip(*[columns[m] for m in mapping])

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        # meta.* TABLES ARE NOT RE-WRAPPED AS A QueryOp
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query, query.frum, query.schema)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)),
                    "name"
                )
            except Exception as _:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            # BEST EFFORT: SORT MAY FAIL ON UNORDERABLE DATA; COMPARISON PROCEEDS ANYWAY
            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass
    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        # SINGLE rownum EDGE: FLATTEN, SORT, AND REBUILD THE CUBE ON BOTH SIDES
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=6)
def sort_table(result):
    """
    SORT ROWS IN TABLE, EVEN IF ELEMENTS ARE JSON
    """
    # GIVE EACH CELL A STRING COLUMN NAME ("0", "1", ...) SO jx CAN SORT ROWS
    keyed_rows = []
    for row in result.data:
        keyed_rows.append({text(i): v for i, v in enumerate(row) if v != None})
    keyed_rows = wrap(keyed_rows)

    ordering = jx.sort(set(jx.get_columns(keyed_rows, leaves=True).name))
    keyed_rows = jx.sort(keyed_rows, ordering)

    # CONVERT BACK TO TUPLES, ONE SLOT PER HEADER COLUMN
    width = len(result.header)
    result.data = [tuple(row[text(i)] for i in range(width)) for row in keyed_rows]
def test_sort_value(self):
    """VERIFY jx.sort HANDLES BOTH THE OBJECT FORM AND THE "." SHORTHAND ON PLAIN VALUES"""
    values = [4, 5, 3, 2, 1]

    # DESCENDING, VIA EXPLICIT SORT DESCRIPTOR
    descending = jx.sort(values, {"value": ".", "sort": -1})
    self.assertEqual(descending, [5, 4, 3, 2, 1])

    # ASCENDING, VIA "." SHORTHAND
    ascending = jx.sort(values, ".")
    self.assertEqual(ascending, [1, 2, 3, 4, 5])
def _merge(*schemas):
    """
    RECURSIVELY MERGE JSON SCHEMAS INTO ONE.

    SCHEMAS MAY BE MAPPINGS (MERGED KEY-BY-KEY) OR LEAF TYPE MARKERS (UNIFIED
    IN THE except BRANCH). RAISES VIA Log.error WHEN LEAF TYPES CONFLICT.
    """
    if len(schemas) == 1:
        return schemas[0]
    try:
        if any(NESTED_TYPE in s for s in schemas):
            # IF THERE ARE ANY ARRAYS, THEN THE MERGE IS AN ARRAY:
            # LIFT EVERY NON-ARRAY SCHEMA INTO THE ARRAY'S ELEMENT SCHEMA
            new_schemas = []
            for schema in schemas:
                if NESTED_TYPE in schema:
                    sub_schema = schema[NESTED_TYPE]
                    residue = {k: v for k, v in schema.items() if k != NESTED_TYPE}
                    new_schemas.append(_merge(sub_schema, residue))
                else:
                    new_schemas.append(schema)
            return {NESTED_TYPE: _merge(*new_schemas)}
        else:
            # PLAIN MAPPINGS: MERGE THE UNION OF KEYS, IN SORTED ORDER
            return OrderedDict(
                (k, _merge(*(ss for s in schemas for ss in [s.get(k)] if ss)))
                for k in jx.sort(set(k for s in schemas for k in s.keys()))
            )
    except Exception as e:
        e = Except.wrap(e)
        if "Expecting types to match" in e:
            # NESTED CONFLICT: PROPAGATE, DO NOT RETRY LEAF UNIFICATION
            raise e
        # schemas WERE NOT MAPPINGS (dict ops raised): TREAT THEM AS LEAF TYPES
        t = list(set(schemas))
        if len(t) == 1:
            return t[0]
        elif len(t) == 2 and STRING in t and NUMBER in t:
            # NUMBERS CAN ALWAYS BE REPRESENTED AS STRINGS
            return STRING
        else:
            Log.error("Expecting types to match {{types|json}}", types=t)
def insert_list(self, table_name, records):
    """
    UPSERT records INTO table_name: DELETE EXISTING ROWS BY _id, THEN BULK INSERT.

    :param table_name: TARGET TABLE
    :param records: LIST OF MAPPINGS; COLUMN SET IS THE UNION OF ALL RECORD KEYS,
                    MISSING KEYS ARE INSERTED AS NULL
    """
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        # NOTE(review): ids are quoted with quote_column, not quote_value —
        # looks suspicious for data values; confirm against the execute() templating
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + SQL_WHERE + "_id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        # BUG FIX: THE ORIGINAL EMITTED A SINGLE FLAT, COLUMN-MAJOR VALUE LIST
        # (for k in columns for r in records) — ONE GIANT TUPLE INSTEAD OF ONE
        # PARENTHESIZED GROUP PER RECORD. EMIT ONE sql_iso(...) PER RECORD,
        # MATCHING THE OTHER insert_list VARIANTS.
        command = (
            SQL_INSERT + self.quote_column(table_name) +
            sql_iso(sql_list(self.quote_column(k) for k in columns)) +
            SQL_VALUES +
            sql_list(
                sql_iso(sql_list(self.quote_value(r.get(k, None)) for k in columns))
                for r in records
            )
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with insert", e)
def insert_list(self, table_name, records):
    """
    UPSERT records INTO table_name: DELETE EXISTING ROWS BY _id, THEN BULK INSERT.

    COLUMN SET IS THE UNION OF ALL RECORD KEYS; MISSING KEYS INSERT AS NULL.
    SQL IS STRING-BUILT — SAFETY RELIES ENTIRELY ON quote_column/quote_value.
    """
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        # REMOVE ANY EXISTING ROWS WITH THE SAME _id BEFORE RE-INSERTING
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        # ONE PARENTHESIZED VALUE GROUP PER RECORD, COLUMNS IN SORTED ORDER
        command = (
            "INSERT INTO " + self.quote_column(table_name) + "(" +
            ",".join([self.quote_column(k) for k in columns]) +
            ") VALUES " +
            ",\n".join([
                sql_iso(",".join([self.quote_value(r.get(k, None)) for k in columns]))
                for r in records
            ])
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with insert", e)
def _update_meta(self):
    """
    RECOMPUTE THE count/cardinality/partitions/multi STATISTICS FOR EVERY
    META COLUMN DESCRIPTOR. NO-OP UNLESS self.dirty IS SET.
    """
    if not self.dirty:
        return

    now = Date.now()
    for mc in META_COLUMNS_DESC.columns:
        count = 0        # NUMBER OF NON-NULL VALUES SEEN
        values = set()   # DISTINCT HASHABLE VALUES
        objects = 0      # VALUES THAT ARE MAPPINGS (OR UNHASHABLE LIST MEMBERS)
        multi = 1        # MAX MULTIPLICITY OF ANY LIST-VALUED CELL
        for column in self._all_columns():
            value = column[mc.name]
            if value == None:
                pass
            else:
                count += 1
                if is_list(value):
                    multi = max(multi, len(value))
                    try:
                        values |= set(value)
                    except Exception:
                        # LIST MEMBERS NOT HASHABLE; COUNT THEM AS OBJECTS INSTEAD
                        objects += len(value)
                elif is_data(value):
                    objects += 1
                else:
                    values.add(value)
        mc.count = count
        mc.cardinality = len(values) + objects
        mc.partitions = jx.sort(values)
        mc.multi = multi
        mc.last_updated = now
    META_COLUMNS_DESC.last_updated = now
    self.dirty = False
def test_bulk_aggs_list(self):
    """
    VERIFY A LARGE groupby (>10K GROUPS) WITH destination="url" RETURNS A URL
    WHOSE CONTENT, ONCE FETCHED, MATCHES THE EXPECTED "list"-FORMAT RESULT.
    """
    data = wrap([{"a": "test" + text(i)} for i in range(10111)])
    expected = jx.sort([{"a": r.a, "count": 1} for r in data], "a")

    test = wrap({
        "data": data,
        "query": {
            "from": TEST_TABLE,
            "groupby": "a",
            "limit": len(data),
            "chunk_size": 1000,
            "sort": "a",
        },
        "expecting_list": {"data": expected[:MAX_LIMIT]},  # DUMMY, TO ENSURE LOADED
    })
    self.utils.execute_tests(test)

    # RE-ISSUE THE QUERY, THIS TIME ASKING FOR A BULK RESULT URL
    test.query.format = "list"
    test.query.destination = "url"
    result = http.post_json(
        url=self.utils.testing.query,
        json=test.query,
    )
    self.assertEqual(result.meta.format, "list")

    # THE BULK FILE IS WRITTEN ASYNCHRONOUSLY; retry UNTIL IT IS AVAILABLE
    @self.retry(result.url)
    def get_content():
        content = http.get_json(result.url)
        self.assertEqual(content.meta.format, "list")
        sorted_content = jx.sort(content.data, "a")
        sorted_expected = jx.sort(expected, "a")
        self.assertEqual(sorted_content, sorted_expected)
def groupby(data, keys=None, contiguous=False):
    """
    :param data: list of data to group
    :param keys: (list of) property path name
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
             keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
             values IS GENERATOR OF ALL VALUE THAT MATCH keys
    """
    if isinstance(data, Container):
        # CONTAINERS KNOW HOW TO GROUP THEMSELVES
        return data.groupby(keys)
    try:
        if not data:
            return Null
        keys = listwrap(keys)
        if not contiguous:
            # SORT SO EQUAL KEYS ARE ADJACENT BEFORE GROUPING
            from jx_python import jx
            data = jx.sort(data, keys)

        if len(keys) == 0 or len(keys) == 1 and keys[0] == '.':
            # GROUP BY THE VALUES THEMSELVES
            return _groupby_value(data)
        if any(is_expression(k) for k in keys):
            raise Log.error("can not handle expressions")

        accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__
        return _groupby_keys(data, keys, accessor)
    except Exception as e:
        Log.error("Problem grouping", cause=e)
def _update_meta(self):
    """
    RECOMPUTE count/cardinality/partitions/multi FOR EVERY META COLUMN IN THE
    "meta.columns" TABLE(S). NO-OP UNLESS self.dirty IS SET.
    """
    if not self.dirty:
        return

    for mcl in self.data.get("meta.columns").values():
        for mc in mcl:
            count = 0        # NON-NULL VALUES SEEN
            values = set()   # DISTINCT HASHABLE VALUES
            objects = 0      # MAPPINGS (OR UNHASHABLE LIST MEMBERS)
            multi = 1        # MAX LENGTH OF ANY LIST-VALUED CELL
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            # LIST MEMBERS NOT HASHABLE; COUNT AS OBJECTS
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = Date.now()
    self.dirty = False
def get_content():
    """FETCH THE BULK RESULT AND COMPARE IT, ORDER-INSENSITIVELY, TO expected"""
    body = http.get_json(result.url)
    self.assertEqual(body.header, ["a"])
    self.assertEqual(body.meta.format, "table")

    actual_sorted = jx.sort(body.data, 0)
    expected_rows = [(row.a, ) for row in expected]
    self.assertEqual(actual_sorted, expected_rows)
def insert_list(self, table_name, records):
    """
    UPSERT records INTO table_name: DELETE ROWS WITH MATCHING _id, THEN BULK INSERT.
    COLUMN SET IS THE UNION OF ALL RECORD KEYS; MISSING KEYS INSERT AS NULL.
    """
    if not records:
        return

    all_keys = set()
    for record in records:
        all_keys |= set(record.keys())
    columns = jx.sort(all_keys)

    try:
        ids = [record["_id"] for record in records]
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column(ids)}
        )

        column_sql = ",".join([self.quote_column(c) for c in columns])
        row_sql = []
        for record in records:
            cells = ",".join([self.quote_value(record.get(c, None)) for c in columns])
            row_sql.append(sql_iso(cells))

        command = (
            "INSERT INTO " + self.quote_column(table_name) +
            "(" + column_sql + ") VALUES " + ",\n".join(row_sql)
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with insert", e)
def _update_meta(self):
    """
    RECOMPUTE count/cardinality/partitions/multi FOR EVERY META COLUMN.
    OLDER VARIANT: READS VALUES VIA mc.names["."] AND USES isinstance CHECKS.
    NO-OP UNLESS self.dirty IS SET.
    """
    if not self.dirty:
        return

    for mcl in self.data.get("meta.columns").values():
        for mc in mcl:
            count = 0        # NON-NULL VALUES SEEN
            values = set()   # DISTINCT HASHABLE VALUES
            objects = 0      # MAPPINGS (OR UNHASHABLE LIST MEMBERS)
            multi = 1        # MAX LENGTH OF ANY LIST-VALUED CELL
            for column in self._all_columns():
                value = column[mc.names["."]]
                if value == None:
                    pass
                else:
                    count += 1
                    if isinstance(value, list):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            # LIST MEMBERS NOT HASHABLE; COUNT AS OBJECTS
                            objects += len(value)
                    elif isinstance(value, Mapping):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = Date.now()
    self.dirty = False
def _edges_op(self, query, frum): query = query.copy() # WE WILL BE MARKING UP THE QUERY index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) outer_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) frum_path = split_field(frum) base_table = join_field(frum_path[0:1]) path = join_field(frum_path[1:]) nest_to_alias = { nested_path: quote_column("__" + unichr(ord('a') + i) + "__") for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema tables = [] for n, a in nest_to_alias.items(): if startswith_field(path, n): tables.append({"nest": n, "alias": a}) tables = jx.sort(tables, {"value": {"length": "nest"}}) from_sql = quote_column(join_field([base_table] + split_field(tables[0].nest))) + tables[0].alias for previous, t in zip(tables, tables[1::]): from_sql += ( SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + t.alias + SQL_ON + join_column(t.alias, quoted_PARENT) + " = " + join_column(previous.alias, quoted_UID) ) main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH ons = [] join_types = [] wheres = [] null_ons = [EXISTS_COLUMN + SQL_IS_NULL] groupby = [] null_groupby = [] orderby = [] domains = [] select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.sf.tables['.'].columns] for edge_index, query_edge in enumerate(query.edges): edge_alias = quote_column("e" + text_type(edge_index)) if query_edge.value: edge_values = [p for c in query_edge.value.to_sql(schema).sql for p in c.items()] elif not query_edge.value and any(query_edge.domain.partitions.where): case = SQL_CASE for pp, p in enumerate(query_edge.domain.partitions): w = p.where.to_sql(schema)[0].sql.b t = quote_value(pp) case += SQL_WHEN + w + SQL_THEN + t case += SQL_ELSE + SQL_NULL + SQL_END # quote value with length of partitions edge_values = [("n", case)] elif 
query_edge.range: edge_values = query_edge.range.min.to_sql(schema)[0].sql.items() + query_edge.range.max.to_sql(schema)[ 0].sql.items()
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    table_path = split_field(table_name)
    root_table_name = table_path[0]

    alias = self._find_alias(root_table_name)
    if not alias:
        # MAYBE THE METADATA IS STALE; FORCE A REFRESH AND TRY ONCE MORE
        self.es_cluster.get_metadata(force=True)
        alias = self._find_alias(root_table_name)
        if not alias:
            Log.error("{{table|quote}} does not exist", table=table_name)

    try:
        last_update = MAX([
            self.es_cluster.index_last_updated[i]
            for i in self.index_to_alias.get_domain(alias)
        ])

        table = self.get_table(alias)[0]
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = TableDesc(name=alias, url=None, query_path=['.'], timestamp=Date.MIN)
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._reload_columns(table)
        elif force or table.timestamp < last_update:
            self._reload_columns(table)

        columns = self.meta.columns.find(alias, column_name)
        # SORT BY THE ESCAPED "." NAME (LITERAL DOT FIELD)
        columns = jx.sort(columns, "names.\\.")
        # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
        while len(self.todo) and not all(columns.get("last_updated")):
            if DEBUG:
                if len(columns) > 10:
                    Log.note("waiting for {{num}} columns to update", num=len([
                        c for c in columns if not c.last_updated
                    ]))
                else:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[
                            c.es_index + "." + c.es_column
                            for c in columns
                            if not c.last_updated
                        ])
            Till(seconds=1).wait()
        return columns
    except Exception as e:
        Log.error("Not expected", cause=e)
    # UNREACHABLE IN PRACTICE (Log.error RAISES); KEEPS THE SIGNATURE TOTAL
    return []
def done_count(self):
    """
    FINALIZE ACCUMULATION: CONVERT THE COLLECTED self.parts INTO A SORTED
    SimpleSetDomain ON self.edge, WITH ONE PARTITION PER DISTINCT TUPLE.
    """
    # BUG FIX: map() RETURNS A ONE-SHOT ITERATOR ON PYTHON 3, BUT columns IS
    # CONSUMED MORE THAN ONCE BELOW (BY jx.sort AND ONCE PER PARTITION), SO IT
    # MUST BE A CONCRETE LIST
    columns = [text_type(i) for i in range(len(self.fields))]
    parts = wrap([{text_type(i): p for i, p in enumerate(part)} for part in set(self.parts)])
    self.parts = None  # RELEASE MEMORY; ACCUMULATION IS DONE
    sorted_parts = jx.sort(parts, columns)

    self.edge.domain = self.domain = SimpleSetDomain(
        key="value",
        partitions=[
            {"value": tuple(v[k] for k in columns), "dataIndex": i}
            for i, v in enumerate(sorted_parts)
        ]
    )
def done_count(self):
    """
    FINALIZE ACCUMULATION: CONVERT THE COLLECTED self.parts INTO A SORTED
    SimpleSetDomain ON self.edge, WITH ONE PARTITION PER DISTINCT TUPLE.
    """
    # BUG FIX: map() RETURNS A ONE-SHOT ITERATOR ON PYTHON 3, BUT columns IS
    # CONSUMED MORE THAN ONCE BELOW (BY jx.sort AND ONCE PER PARTITION), SO IT
    # MUST BE A CONCRETE LIST
    columns = [text_type(i) for i in range(len(self.fields))]
    parts = wrap([{text_type(i): p for i, p in enumerate(part)} for part in set(self.parts)])
    self.parts = None  # RELEASE MEMORY; ACCUMULATION IS DONE
    sorted_parts = jx.sort(parts, columns)

    self.edge.domain = self.domain = SimpleSetDomain(
        key="value",
        partitions=[
            {"value": tuple(v[k] for k in columns), "dataIndex": i}
            for i, v in enumerate(sorted_parts)
        ]
    )
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
             keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
             values IS GENERATOR OF ALL VALUE THAT MATCH keys
    contiguous -
    """
    if isinstance(data, Container):
        # CONTAINERS KNOW HOW TO GROUP THEMSELVES
        return data.groupby(keys)
    if size != None or min_size != None or max_size != None:
        # SIZE-BASED CHUNKING INSTEAD OF KEY-BASED GROUPING
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            # SORT SO EQUAL KEYS ARE ADJACENT
            from jx_python import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

            def _output():
                # SCAN data, EMITTING A (group, slice) PAIR EACH TIME THE KEY TUPLE CHANGES
                start = 0
                prev = accessor(data[0])
                for i, d in enumerate(data):
                    curr = accessor(d)
                    if curr != prev:
                        group = {}
                        for k, gg in zip(keys, prev):
                            group[k] = gg
                        yield Data(group), data[start:i:]
                        start = i
                        prev = curr
                # EMIT THE FINAL GROUP
                group = {}
                for k, gg in zip(keys, prev):
                    group[k] = gg
                yield Data(group), data[start::]

            return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
def _es_terms2(es, mvel, query): """ WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value """ # REQUEST VALUES IN FIRST DIMENSION q1 = query.copy() q1.edges = query.edges[0:1:] values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value select = listwrap(query.select) FromES = build_es_query(query) for s in select: for i, v in enumerate(values1): FromES.facets[s.name + "," + str(i)] = { "terms": { "field": query.edges[1].value, "size": coalesce(query.limit, 200000) }, "facet_filter": simplify_esfilter({"and": [ query.where, {"term": {query.edges[0].value: v}} ]}) } data = es_post(es, FromES, query.limit) # UNION ALL TERMS FROM SECOND DIMENSION values2 = set() for k, f in data.facets.items(): values2.update(f.terms.term) values2 = jx.sort(values2) term2index = {v: i for i, v in enumerate(values2)} query.edges[1].domain.partitions = FlatList([{"name": v, "value": v} for v in values2]) # MAKE CUBE output = {} dims = [len(values1), len(values2)] for s in select: output[s.name] = Matrix(*dims) # FILL CUBE # EXPECTING ONLY SELECT CLAUSE FACETS for facetName, facet in data.facets.items(): coord = facetName.split(",") s = [s for s in select if s.name == coord[0]][0] i1 = int(coord[1]) for term in facet.terms: i2 = term2index[term.term] output[s.name][(i1, i2)] = term[aggregates[s.aggregate]] cube = Cube(query.select, query.edges, output) cube.query = query return cube
def _es_terms2(es, mvel, query): """ WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value """ # REQUEST VALUES IN FIRST DIMENSION q1 = query.copy() q1.edges = query.edges[0:1:] values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value select = listwrap(query.select) FromES = build_es_query(query) for s in select: for i, v in enumerate(values1): FromES.facets[s.name + "," + str(i)] = { "terms": { "field": query.edges[1].value, "size": coalesce(query.limit, 200000) }, "facet_filter": simplify_esfilter({"and": [ query.where, {"term": {query.edges[0].value: v}} ]}) } data = es_post(es, FromES, query.limit) # UNION ALL TERMS FROM SECOND DIMENSION values2 = set() for k, f in data.facets.items(): values2.update(f.terms.term) values2 = jx.sort(values2) term2index = {v: i for i, v in enumerate(values2)} query.edges[1].domain.partitions = FlatList([{"name": v, "value": v} for v in values2]) # MAKE CUBE output = {} dims = [len(values1), len(values2)] for s in select: output[s.name] = Matrix(*dims) # FILL CUBE # EXPECTING ONLY SELECT CLAUSE FACETS for facetName, facet in data.facets.items(): coord = facetName.split(",") s = [s for s in select if s.name == coord[0]][0] i1 = int(coord[1]) for term in facet.terms: i2 = term2index[term.term] output[s.name][(i1, i2)] = term[aggregates[s.aggregate]] cube = Cube(query.select, query.edges, output) cube.query = query return cube
def save_money(self, remaining_budget, net_new_utility): remove_spot_requests = wrap([]) # FIRST CANCEL THE PENDING REQUESTS if remaining_budget < 0: requests = self._get_managed_spot_requests() for r in requests: if r.status.code in PENDING_STATUS_CODES | PROBABLY_NOT_FOR_A_WHILE | MIGHT_HAPPEN: remove_spot_requests.append(r.id) net_new_utility += self.settings.utility[ r.launch_specification.instance_type].utility remaining_budget += r.price instances = jx.sort(self.running_instances(), "markup.estimated_value") remove_list = wrap([]) for s in instances: if remaining_budget >= 0: break remove_list.append(s) net_new_utility += coalesce(s.markup.type.utility, 0) remaining_budget += coalesce(s.request.bid_price, s.markup.price_80, s.markup.current_price) if not remove_list: return remaining_budget, net_new_utility # SEND SHUTDOWN TO EACH INSTANCE Log.warning("Shutdown {{instances}} to save money!", instances=remove_list.id) if ALLOW_SHUTDOWN: for g, removals in jx.chunk(remove_list, size=20): for i, t in [(i, Thread.run("teardown " + i.id, self.instance_manager.teardown, i, please_stop=False)) for i in removals]: try: t.join() except Exception: Log.note("Problem with shutdown of {{id}}", id=i.id) remove_spot_requests.extend(remove_list.spot_instance_request_id) # TERMINATE INSTANCES self.ec2_conn.terminate_instances(instance_ids=remove_list.id) # TERMINATE SPOT REQUESTS self.ec2_conn.cancel_spot_instance_requests( request_ids=remove_spot_requests) return remaining_budget, net_new_utility
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    abs_column_name = None if column_name == None else concat_field(query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = Table(name=es_index_name, url=None, query_path=['.'], timestamp=Date.now())
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            # METADATA IS STALE; REFRESH
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        if columns:
            # SORT BY THE ESCAPED "." NAME (NOTE(review): "names.\." is an
            # unrecognized escape in a non-raw string — presumably intended as
            # "names.\\."; verify against the fixed sibling variant)
            columns = jx.sort(columns, "names.\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[
                            c.es_index + "." + c.es_column
                            for c in columns
                            if not c.last_updated
                        ])
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def json_schema_to_markdown(schema):
    """
    RENDER A JSON SCHEMA AS A MARKDOWN DOCUMENT: TITLE, DESCRIPTION, THEN ONE
    SECTION PER TOP-LEVEL PROPERTY (STRUCTURES GET NESTED BULLET LISTS).
    """
    from jx_python import jx

    def _md_code(code):
        return "`" + code + "`"

    def _md_italic(value):
        return "*" + value + "*"

    def _inner(schema, parent_name, indent):
        # ONE BULLET PER PROPERTY; RECURSE INTO STRUCTURED TYPES
        bullets = []
        for prop_name, prop in schema.items():
            full_name = concat_field(parent_name, prop_name)
            bullet = indent + "* " + _md_code(full_name)
            if prop.type:
                bullet += " - " + _md_italic(prop.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if prop.description:
                bullet += " " + prop.description
            bullets.append(bullet)
            if prop.type in ["object", "array", "nested"]:
                bullets.extend(_inner(prop.properties, full_name, indent + " "))
        return bullets

    lines = []
    if schema.title:
        lines.append("#" + schema.title)
    lines.append(schema.description)
    lines.append("")

    for prop_name, prop in jx.sort(schema.properties.items(), 0):
        full_name = prop_name
        is_structured = prop.type in ["object", "array", "nested"]
        if is_structured:
            lines.append("##" + _md_code(full_name) + " Property")
            if prop.description:
                lines.append(prop.description)
            lines.append("")
            if is_structured:
                lines.extend(_inner(prop.properties, full_name, " "))
        else:
            lines.append("##" + _md_code(full_name) + " (" + prop.type + ")")
            if prop.description:
                lines.append(prop.description)

    return "\n".join(lines)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    table_path = split_field(table_name)
    root_table_name = table_path[0]

    alias = self._find_alias(root_table_name)
    if not alias:
        # MAYBE THE METADATA IS STALE; FORCE A REFRESH AND TRY ONCE MORE
        self.es_cluster.get_metadata(force=True)
        alias = self._find_alias(root_table_name)
        if not alias:
            Log.error("{{table|quote}} does not exist", table=table_name)

    try:
        last_update = MAX([
            self.es_cluster.index_last_updated[i]
            for i in self.index_to_alias.get_domain(alias)
        ])

        table = self.get_table(alias)[0]
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = TableDesc(
                name=alias,
                url=None,
                query_path=['.'],
                timestamp=Date.MIN
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._reload_columns(table)
        elif force or table.timestamp < last_update:
            self._reload_columns(table)

        columns = self.meta.columns.find(alias, column_name)
        # SORT BY THE ESCAPED "." NAME (LITERAL DOT FIELD)
        columns = jx.sort(columns, "names.\\.")
        # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
        while len(self.todo) and not all(columns.get("last_updated")):
            if DEBUG:
                if len(columns) > 10:
                    Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                else:
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
            Till(seconds=1).wait()
        return columns
    except Exception as e:
        Log.error("Not expected", cause=e)
    # UNREACHABLE IN PRACTICE (Log.error RAISES); KEEPS THE SIGNATURE TOTAL
    return []
def json_schema_to_markdown(schema):
    """
    RENDER A JSON SCHEMA AS MARKDOWN: TITLE, DESCRIPTION, THEN ONE SECTION PER
    TOP-LEVEL PROPERTY; STRUCTURED TYPES GET NESTED BULLET LISTS.
    """
    from jx_python import jx

    def _md_code(code):
        # INLINE-CODE MARKUP
        return "`"+code+"`"

    def _md_italic(value):
        # ITALIC MARKUP
        return "*"+value+"*"

    def _inner(schema, parent_name, indent):
        # ONE BULLET PER PROPERTY; RECURSE INTO object/array/nested TYPES
        more_lines = []
        for k,v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent+"* "+_md_code(full_name)
            if v.type:
                details += " - "+_md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent+" "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#"+schema.title)
    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##"+_md_code(full_name)+" Property")
            if v.description:
                lines.append(v.description)
            lines.append("")
            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, " "))
        else:
            lines.append("##"+_md_code(full_name)+" ("+v.type+")")
            if v.description:
                lines.append(v.description)
    return "\n".join(lines)
def __init__(self, edge, query, limit):
    """
    DECODER FOR A SET-DOMAIN EDGE; IF THE QUERY SORTS ON THE SAME VARIABLES AS
    THE EDGE, THE DOMAIN PARTITIONS ARE PRE-SORTED TO MATCH.
    """
    AggsDecoder.__init__(self, edge, query, limit)
    domain = self.domain = edge.domain
    self.sorted = None                     # "asc"/"desc" WHEN A MATCHING SORT IS FOUND
    self.pull = pull_functions[STRING]     # TERMS COME BACK AS STRINGS
    # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
    # self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)]
    edge_var = set(v.var for v in edge.value.vars())
    if query.sort:
        for s in query.sort:
            # MATCH WHEN THE SORT TERM'S VARIABLES COVER ALL OF THE EDGE'S VARIABLES
            if not edge_var - set(v.var for v in s.value.vars()):
                self.sorted = {1: "asc", -1: "desc"}[s.sort]
                parts = jx.sort(domain.partitions, {"value": domain.key, "sort": s.sort})
                edge.domain = self.domain = SimpleSetDomain(key=domain.key, label=domain.label, partitions=parts)
def __init__(self, edge, query, limit):
    """
    DECODER FOR A SET-DOMAIN EDGE; IF THE QUERY SORTS ON THE SAME VARIABLES AS
    THE EDGE, THE DOMAIN PARTITIONS ARE PRE-SORTED TO MATCH.
    """
    AggsDecoder.__init__(self, edge, query, limit)
    domain = self.domain = edge.domain
    # BUG FIX: ALWAYS DEFINE self.sorted. THE ORIGINAL ONLY SET IT IN THE
    # MATCHING-SORT BRANCH OR THE else, SO A TRUTHY query.sort WITH NO MATCHING
    # TERM LEFT self.sorted UNDEFINED (AttributeError ON LATER READS).
    # THE SIBLING VARIANT OF THIS DECODER PRE-INITIALIZES THE SAME WAY.
    self.sorted = None
    # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
    # self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)]
    edge_var = edge.value.vars()
    if query.sort:
        for s in query.sort:
            # MATCH WHEN THE SORT TERM'S VARIABLES COVER ALL OF THE EDGE'S VARIABLES
            if not edge_var - s.value.vars():
                self.sorted = {1: "asc", -1: "desc"}[s.sort]
                parts = jx.sort(domain.partitions, {"value": domain.key, "sort": s.sort})
                edge.domain = self.domain = SimpleSetDomain(key=domain.key, label=domain.label, partitions=parts)
def running_instances(self):
    """
    RETURN MANAGED INSTANCES, EACH ANNOTATED WITH ITS PRICING markup, SORTED
    HIGHEST-UTILITY FIRST AND, WITHIN UTILITY, CHEAPEST-ESTIMATE FIRST.
    """
    # FIND THE BIGGEST, MOST EXPENSIVE REQUESTS
    instances = self._get_managed_instances()
    for r in instances:
        try:
            r.markup = self.price_lookup[r.instance_type, r.placement]
        except Exception as e:
            # BUG FIX: THE ORIGINAL REPEATED THE FAILING LOOKUP HERE, WHICH
            # RE-RAISED THE SAME EXCEPTION AND MADE Log.error UNREACHABLE.
            # REPORT THE MISSING PRICE WITH ITS CAUSE INSTEAD.
            Log.error("No pricing!!!", e)

    instances = jx.sort(instances, [
        {"value": "markup.type.utility", "sort": -1},
        {"value": "markup.estimated_value", "sort": 1}
    ])
    return instances
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    RESTORE THE ORIGINAL pyparsing PARSE METHOD AND WRITE THE COLLECTED
    PROFILING STATS TO A TIMESTAMPED TAB-SEPARATED FILE.
    """
    ParserElement._parse = self.previous_parse
    profile = jx.sort(
        [{
            "parser": text(parser),
            "cache_hits": cache,
            "matches": match,
            "failures": fail,
            "call_count": match + fail + cache,
            "total_parse": parse,
            "total_overhead": all - parse,
            # NOTE(review): divides by (match + fail) — a parser with only
            # cache hits would raise ZeroDivisionError; confirm timing never
            # records such an entry
            "per_parse": parse / (match + fail),
            "per_overhead": (all - parse) / (match + fail + cache),
        } for parser, (cache, match, fail, parse, all) in timing.items()],
        {"total_parse": "desc"},
    )
    self.file.add_suffix(Date.now().format("%Y%m%d_%H%M%S")).write(
        convert.list2tab(profile))
def _merge(*schemas):
    """
    MERGE JSON SCHEMAS INTO ONE (FLAT VARIANT, NO ARRAY HANDLING).

    SCHEMAS MAY BE MAPPINGS (MERGED KEY-BY-KEY) OR LEAF TYPE MARKERS
    (UNIFIED IN THE except BRANCH). RAISES VIA Log.error ON TYPE CONFLICT.
    """
    if len(schemas) == 1:
        return schemas[0]
    try:
        # PLAIN MAPPINGS: MERGE THE UNION OF KEYS, IN SORTED ORDER
        return OrderedDict(
            (k, _merge(*[ss for s in schemas for ss in [s.get(k)] if ss]))
            for k in jx.sort(set(k for s in schemas for k in s.keys()))
        )
    except Exception as e:
        e = Except.wrap(e)
        if "Expecting types to match" in e:
            # NESTED CONFLICT: PROPAGATE UNCHANGED
            raise e
        # schemas WERE NOT MAPPINGS (dict ops raised): TREAT AS LEAF TYPES
        t = list(set(schemas))
        if len(t) == 1:
            return t[0]
        elif len(t) == 2 and STRING in t and NUMBER in t:
            # NUMBERS CAN ALWAYS BE REPRESENTED AS STRINGS
            return STRING
        else:
            Log.error("Expecting types to match {{types|json}}", types=t)
def insert_list(self, table_name, records):
    """
    BULK-INSERT records INTO table_name.

    :param table_name: TARGET TABLE
    :param records: LIST OF MAPPINGS; COLUMN SET IS THE UNION OF ALL RECORD
                    KEYS, MISSING KEYS INSERT AS NULL
    """
    if not records:
        return

    keys = set()
    for r in records:
        keys |= set(r.keys())
    keys = jx.sort(keys)

    try:
        # BUG FIX: r[k] RAISED KeyError FOR ANY RECORD MISSING ONE OF THE
        # UNION-OF-ALL-KEYS COLUMNS; USE r.get(k, None) LIKE THE OTHER
        # insert_list VARIANTS SO SPARSE RECORDS INSERT NULL
        command = (
            SQL_INSERT + quote_column(table_name) +
            sql_iso(sql_list([quote_column(k) for k in keys])) +
            SQL_VALUES +
            sql_list(
                sql_iso(sql_list([quote_value(r.get(k, None)) for k in keys]))
                for r in records
            )
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with record: {{record}}", record=records, cause=e)
def insert_list(self, table_name, records):
    """
    BULK-INSERT records INTO table_name. COLUMN SET IS THE UNION OF ALL
    RECORD KEYS.
    """
    if not records:
        return

    keys = set()
    for r in records:
        keys |= set(r.keys())
    keys = jx.sort(keys)

    try:
        # NOTE(review): r[k] raises KeyError for a record missing one of the
        # union-of-all-keys columns — presumably records are mo-dots Data (which
        # return Null) or share a uniform key set; verify against callers
        command = (
            "INSERT INTO " + quote_column(table_name) +
            sql_iso(sql_list([quote_column(k) for k in keys])) +
            " VALUES " +
            sql_list([
                sql_iso(sql_list([quote_value(r[k]) for k in keys]))
                for r in records
            ])
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with record: {{record}}", record=records, cause=e)
def insert_list(self, table_name, records):
    """
    BULK-INSERT records INTO table_name (STRING-BUILT SQL VARIANT). COLUMN SET
    IS THE UNION OF ALL RECORD KEYS; SAFETY RELIES ON quote_column/quote_value.
    """
    if not records:
        return

    keys = set()
    for r in records:
        keys |= set(r.keys())
    keys = jx.sort(keys)

    try:
        # NOTE(review): r[k] raises KeyError for a record missing one of the
        # union-of-all-keys columns — verify records share a uniform key set
        command = \
            "INSERT INTO " + self.quote_column(table_name) + "(" + \
            ",".join([self.quote_column(k) for k in keys]) + \
            ") VALUES " + ",\n".join([
                "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                for r in records
            ])
        self.execute(command)
    except Exception as e:
        Log.error("problem with record: {{record}}", record=records, cause=e)
def _schema_to_bq_schema(jx_path, es_path, schema):
    """
    CONVERT A TYPED JSON SCHEMA INTO A LIST OF BigQuery SchemaField OBJECTS.

    NOTE(review): this body references `self` and `top_fields`, neither of
    which is in the signature — it reads like a closure lifted out of an
    enclosing method where both are in scope; confirm before reuse.
    """
    output = []
    nt = schema.get(NESTED_TYPE)
    if nt:
        # AN ARRAY SWALLOWS ALL SIBLING TYPES
        schema = {NESTED_TYPE: nt}
    for t, sub_schema in jx.sort(schema.items(), 0):
        bqt = typed_to_bq_type.get(
            t, {"field_type": "RECORD", "mode": "NULLABLE"}
        )
        full_name = es_path + escape_name(t)
        top_field = self._top_level_fields.get(text(full_name))
        if is_text(sub_schema):
            # LEAF: sub_schema IS A JSON TYPE NAME
            new_field_type = json_type_to_bq_type.get(sub_schema, sub_schema)
            if new_field_type != bqt["field_type"]:
                # OVERRIDE TYPE
                bqt = bqt.copy()
                bqt["field_type"] = new_field_type
            fields = ()
        else:
            # STRUCTURE: RECURSE
            fields = _schema_to_bq_schema(jx_path + (t,), full_name, sub_schema)

        if top_field:
            if fields:
                Log.error("not expecting a structure")
            if self._partition.field == top_field:
                if bqt["field_type"] != "TIMESTAMP":
                    Log.error("Partition field must be of time type")
            struct = SchemaField(name=top_field, fields=fields, **bqt)
            top_fields.append(struct)
        elif not fields and bqt["field_type"] == "RECORD":
            # THIS CAN HAPPEN WHEN WE MOVE A PRIMITIVE FIELD TO top_fields
            pass
        else:
            struct = SchemaField(
                name=text(escape_name(t)), fields=fields, **bqt
            )
            output.append(struct)
    return output
def test_scroll_query_table(self):
    """
    VERIFY A LARGE (>10K ROW) SCROLL QUERY WITH destination="url" RETURNS A
    URL WHOSE CONTENT, ONCE FETCHED, MATCHES THE "table"-FORMAT EXPECTATION.
    """
    data = wrap([{"a": "test" + text(i)} for i in range(10111)])
    expected = jx.sort(data, "a")

    test = wrap({
        "data": data,
        "query": {
            "from": TEST_TABLE,
            "select": ["a"],
            "limit": len(data),
            "chunk_size": 10000,
            "sort": "a",
        },
        "expecting_list": {"data": expected[:MAX_LIMIT]},  # DUMMY, TO ENSURE LOADED
    })
    self.utils.execute_tests(test)

    # RE-ISSUE THE QUERY ASKING FOR A BULK RESULT URL (NO SORT: ORDER IS CHECKED CLIENT-SIDE)
    test.query.format = "table"
    test.query.sort = None
    test.query.destination = "url"
    result = http.post_json(
        url=self.utils.testing.query,
        json=test.query,
    )
    self.assertEqual(result.meta.format, "table")

    # THE BULK FILE IS WRITTEN ASYNCHRONOUSLY; retry UNTIL IT IS AVAILABLE
    @self.retry(result.url)
    def get_content():
        content = http.get_json(result.url)
        self.assertEqual(content.header, ["a"])
        self.assertEqual(content.meta.format, "table")
        sorted_content = jx.sort(content.data, 0)
        sorted_expected = [(row.a, ) for row in expected]
        self.assertEqual(sorted_content, sorted_expected)
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

    Updates the matching row in self.meta.columns with count, cardinality,
    multi (max values-per-doc), and - when small enough to enumerate - the
    partitions themselves.
    """
    if column.es_index in self.index_does_not_exist:
        return
    if column.type in STRUCT:
        Log.error("not supported")
    try:
        # the meta.columns / meta.tables tables are local; compute stats
        # directly rather than querying ES
        if column.es_index == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        if column.es_index == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.abs_columns
            if cc.es_column == column.es_column and cc.type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": {
                        "filter": {
                            "match_all": {}
                        }
                    }
                },
                "size": 0
            })
            count = result.hits.total
            # 1001 is a sentinel meaning "too many to enumerate"
            cardinality = 1001
            multi = 1001
        elif column.es_column == "_id":
            result = self.default_es.post("/" + es_index + "/_search", data={
                "query": {
                    "match_all": {}
                },
                "size": 0
            })
            # _id is unique by definition
            count = cardinality = result.hits.total
            multi = 1
        else:
            result = self.default_es.post(
                "/" + es_index + "/_search",
                data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {
                            "max": {
                                "script": "doc[" + quote(column.es_column) + "].values.size()"
                            }
                        }
                    },
                    "size": 0
                })
            r = result.aggregations.count
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, r.doc_count)
            # NOTE(review): the "multi" agg is a sibling of "count" in the
            # request, so r.multi here looks like it should be
            # result.aggregations.multi - confirm (falls back to 1 either way)
            multi = coalesce(r.multi.value, 1)
        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)
        if column.es_column == "_id":
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count
                                    ) or (count >= 1000 and cardinality / count > 0.99):
            # too many distinct values to enumerate as partitions
            if DEBUG:
                Log.note("{{table}}.{{field}} has {{num}} parts",
                         table=column.es_index,
                         field=column.es_column,
                         num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            # numeric columns with many values are not worth partitioning
            if DEBUG:
                Log.note("{{field}} has {{num}} parts",
                         field=column.es_index,
                         num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        elif len(column.nested_path) != 1:
            # nested column: wrap terms agg in a nested agg
            query.aggs["_"] = {
                "nested": {
                    "path": column.nested_path[0]
                },
                "aggs": {
                    "_nested": {
                        "terms": {
                            "field": column.es_column
                        }
                    }
                }
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {
                "terms": {
                    "field": column.es_column,
                    "size": cardinality
                }
            }

        # enumerate the actual partitions
        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        if DEBUG:
            Log.note("{{field}} has {{parts}}",
                     field=column.names["."],
                     parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        TEST_TABLE = "testdata"
        # NOTE(review): `w in e` relies on the project's exception wrapper
        # supporting containment tests - confirm e is wrapped by the logger
        is_missing_index = any(
            w in e
            for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = any(
            column.es_index.startswith(t)
            for t in [TEST_TABLE_PREFIX, TEST_TABLE])
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
            self.index_does_not_exist.add(column.es_index)
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {
                    "eq": {
                        "names.\\.": ".",
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
            Log.warning(
                "Could not get {{col.es_index}}.{{col.es_column}} info",
                col=column,
                cause=e)
def done_count(self):
    """
    Finish part-collection: build the set domain from the accumulated parts,
    attach it to both the edge and self, and mark the domain as computed.
    """
    distinct_parts = set(self.parts)
    domain = SimpleSetDomain(partitions=jx.sort(distinct_parts))
    self.domain = domain
    self.edge.domain = domain
    self.parts = None
    self.computed_domain = True
def get_columns(self, table_name, column_name=None, after=None, timeout=None):
    """
    RETURN METADATA COLUMNS

    :param table_name: TABLE WE WANT COLUMNS FOR
    :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
    :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
    :param timeout: Signal; True when should give up
    :return: columns sorted by name; [] on failure (after Log.error)
    """
    DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
    table_path = split_field(table_name)
    root_table_name = table_path[0]

    # resolve root table to its alias; refresh cluster metadata once if unknown
    alias = self._find_alias(root_table_name)
    if not alias:
        self.es_cluster.get_metadata(force=True)
        alias = self._find_alias(root_table_name)
        if not alias:
            Log.error("{{table|quote}} does not exist", table=table_name)

    try:
        table = self.get_table(alias)[0]
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            # first sighting: register the table, then load its columns
            table = TableDesc(
                name=alias,
                url=None,
                query_path=["."],
                timestamp=Date.MIN
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            columns = self._reload_columns(table)
            DEBUG and Log.note("columns from reload")
        elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
            # caller demanded freshness, or cluster metadata is newer than ours
            columns = self._reload_columns(table)
            DEBUG and Log.note("columns from reload")
        else:
            columns = self.meta.columns.find(alias, column_name)
            DEBUG and Log.note("columns from find()")
        DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

        columns = jx.sort(columns, "name")
        if after is None:
            return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

        # WAIT FOR THE COLUMNS TO UPDATE
        while True:
            # a column is pending when stale, or its cardinality was never measured
            pending = [
                c
                for c in columns
                if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)
            ]
            if not pending:
                break
            if timeout:
                Log.error("trying to gets columns timed out")
            if DEBUG:
                if len(pending) > 10:
                    Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                else:
                    Log.note(
                        "waiting for columns to update by {{timestamp}}; {{columns|json}}",
                        timestamp=after,
                        columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending]
                    )
            Till(seconds=1).wait()
        return columns
    except Exception as e:
        Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

    return []
def done_count(self):
    """
    Finish part-collection for this edge: nulls are disallowed, and the
    accumulated parts become the edge's set domain.
    """
    self.edge.allowNulls = False
    unique_parts = set(self.parts)
    domain = SimpleSetDomain(partitions=jx.sort(unique_parts))
    self.domain = domain
    self.edge.domain = domain
    self.parts = None
def query(self, query):
    """
    :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return: Data in the requested format ("container", "cube", "table",
             "list"/"value"); raises via Log.error on unknown format
    """
    if not startswith_field(query['from'], self.sf.fact):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    table = self.sf.tables[relative_field(frum, self.sf.fact)]
    schema = table.schema
    query = QueryOp.wrap(query, table=table, schema=schema)
    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_column(new_table) + " AS "
    else:
        create_table = ""

    # choose the SQL-generation strategy
    if query.groupby and query.format != "cube":
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.groupby:
        # groupby with cube output: run as edges, swap back afterwards
        query.edges, query.groupby = query.groupby, query.edges
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
        query.edges, query.groupby = query.groupby, query.edges
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        # plain set-op builds and returns its own result
        op = self._set_op(query, frum)
        return op

    result = self.db.query(command)

    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        # map push_column index -> output column name
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name

        if len(query.edges) == 0 and len(query.groupby) == 0:
            # scalar cube: single row of aggregates
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}
            return Data(
                data=unwrap(data),
                select=select,
                meta={"format": "cube"}
            )

        if not result.data:
            # empty result: emit empty matrices with best-effort edge domains
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data[s.name] = Matrix(dims=dims)

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data.items()}
            )

        columns = None

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                # derive the domain from the distinct values in the result
                if not columns:
                    # NOTE(review): zip() is not subscriptable under Python 3;
                    # columns[i] below assumes a list - confirm intended runtime
                    columns = zip(*result.data)
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}

                if query.sort[i].sort == -1:
                    domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                else:
                    domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(
                name=e.name,
                allowNulls=allowNulls,
                domain=domain
            ))

        data_cubes = {}
        for si, s in enumerate(listwrap(query.select)):
            if s.aggregate == "count":
                data_cubes[s.name] = Matrix(dims=dims, zeros=0)
            else:
                data_cubes[s.name] = Matrix(dims=dims)

        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for i, s in enumerate(index_to_columns.values()):
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if query.select == None:
            select = Null
        elif isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(
            meta={"format": "cube"},
            edges=edges,
            select=select,
            data={k: v.cube for k, v in data_cubes.items()}
        )
    elif query.format == "table" or (not query.format and query.groupby):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name

        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    # tuple-valued column: fill one slot of the tuple
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(
            meta={"format": "table"},
            header=column_names,
            data=data
        )
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            # pure aggregate: single "value" result
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        if data[c.push_name] == None:
                            data[c.push_name] = c.pull(result.data[0])
                        elif isinstance(data[c.push_name], list):
                            data[c.push_name].append(c.pull(result.data[0]))
                        else:
                            data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=data
                )
            else:
                data = Data()
                for s in index_to_columns.values():
                    if not data[s.push_child]:
                        data[s.push_child] = s.pull(result.data[0])
                    else:
                        data[s.push_child] += [s.pull(result.data[0])]
                output = Data(
                    meta={"format": "value"},
                    data=unwrap(data)
                )
        else:
            data = []
            for rownum in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(rownum)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(rownum)
                    else:
                        row[c.push_name][c.push_child] = c.pull(rownum)
                data.append(row)

            output = Data(
                meta={"format": "list"},
                data=data
            )
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
def int_list_packer(term, values):
    """
    Compress a collection of integers into an ES filter made of singletons,
    contiguous ranges, and exclusions from those ranges.

    :param term: field name the filter applies to
    :param values: integers to match
    :return: ES filter dict - {"terms": ...} when no range is worthwhile,
             otherwise an {"or"/"and"} combination of range filters,
             excluded terms, and leftover singletons

    Fix: the local previously named `sorted` shadowed the builtin; renamed
    to `sorted_values`.
    """
    DENSITY = 10  # a range can have holes, this is inverse of the hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted_values = jx.sort(values)

    last = sorted_values[0]
    curr_start = last
    curr_excl = set()

    for v in sorted_values[1:]:
        if v <= last + 1:
            # contiguous (or duplicate): extend the current run
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            # small gap (2-3): maybe keep the run going with exclusions
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles
                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    # flush the trailing run
    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: jx.sort(exclude)}}}]}
        if singletons:
            return {"or": [
                {"terms": {term: jx.sort(singletons)}},
                r
            ]}
        else:
            return r
    else:
        # no worthwhile range found; plain terms filter on the original values
        return {"terms": {term: values}}
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

    Updates the matching row in self.meta.columns with count, cardinality,
    multi (max values-per-doc), and - when small enough to enumerate - the
    partitions themselves.
    """
    if column.es_index in self.index_does_not_exist:
        return
    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        # local meta tables: compute stats directly, no ES round-trip
        if column.es_index == "meta.columns":
            partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": {"filter": {"match_all": {}}}
                },
                "size": 0
            })
            count = result.hits.total
            # sentinel: at least 1001 ("too many to enumerate")
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            # _id is unique by definition
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column)
                },
                "size": 0
            })
            count = result.hits.total
            cardinality = 2
            multi = 1
        else:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column),
                    "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                },
                "size": 0
            })
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results.multi.value, 1))
        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            # too many distinct values to enumerate as partitions
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            # numeric columns with many values are not worth partitioning
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            # nested column: wrap terms agg in a nested agg
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        # enumerate the actual partitions
        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": Date.now()
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)