def test_val_to_json():
    """Check that coerce_val_to_json renders datetimes as strings, including
    ones nested inside a list or a dict, while leaving plain ints untouched."""
    moment = datetime(2010, 1, 1)
    rendered = "2010-01-01 00:00:00"

    # Bare value.
    assert coerce_val_to_json(moment) == rendered

    # Value nested in a list alongside ints.
    assert coerce_val_to_json([1, 2, moment]) == [1, 2, rendered]

    # Value nested in a dict alongside ints.
    source = {"one": 1, "two": 2, "datetime": moment}
    expected = {"one": 1, "two": 2, "datetime": rendered}
    assert coerce_val_to_json(source) == expected
def _to_str(results: List[Tuple[Any]], use_json: bool = False) -> str:
    """
    Render result rows as a string.

    :param results: Rows to render
    :param use_json: If True, pass the rows through coerce_val_to_json and dump
        them as JSON; otherwise format them with tabulate's "plain" table format.
    :return: The rendered string
    """
    # Imports stay local to each branch so only the chosen formatter is loaded.
    if not use_json:
        from tabulate import tabulate

        return tabulate(results, tablefmt="plain")

    import json

    from splitgraph.core.common import coerce_val_to_json

    serializable = coerce_val_to_json(results)
    return json.dumps(serializable)
def generate_range_index(
    object_engine: "PsycopgEngine",
    object_id: str,
    table_schema: "TableSchema",
    changeset: Optional[Changeset],
    columns: Optional[List[str]] = None,
) -> Dict[str, Tuple[T, T]]:
    """
    Build a min/max range index over the columns of an object, widened by the
    old values of rows that the changeset replaced or deleted.

    :param object_engine: Engine the object is located on
    :param object_id: ID of the object
    :param table_schema: Schema of the table
    :param changeset: Changeset whose old row values are folded into the ranges
        (skipped when empty or None)
    :param columns: Names of the columns to index (default: all). PK columns of
        an indexable type are indexed regardless.
    :return: Dictionary of {column: (min, max)} with both bounds passed through
        coerce_val_to_json. When the object's key spans more than one column,
        an extra "$pk" entry holds the key range.
    """
    if columns is None:
        requested = [c.name for c in table_schema]
    else:
        requested = columns

    # Key columns: the declared PK or, when there is none, every column whose
    # type (as declared, without stripping the modifier) is indexable.
    pk_columns = [c.name for c in table_schema if c.is_pk]
    if not pk_columns:
        pk_columns = [c.name for c in table_schema if c.pg_type in PG_INDEXABLE_TYPES]

    column_types = {c.name: _strip_type_mod(c.pg_type) for c in table_schema}

    # Only indexable types qualify; among those, take PK columns plus whatever
    # the caller asked for.
    columns_to_index = []
    for c in table_schema:
        if _strip_type_mod(c.pg_type) not in PG_INDEXABLE_TYPES:
            continue
        if c.is_pk or c.name in requested:
            columns_to_index.append(c.name)

    logging.debug("Running range index on columns %s", columns_to_index)

    # One "MIN(col), MAX(col)" pair per indexed column, in columns_to_index order.
    aggregates = []
    for name in columns_to_index:
        pg_type = column_types[name]
        template = (
            _inject_collation("MIN({0}", pg_type)
            + "), "
            + _inject_collation("MAX({0}", pg_type)
            + ")"
        )
        aggregates.append(SQL(template).format(Identifier(name)))

    query = (
        SQL("SELECT ")
        + SQL(",").join(aggregates)
        + SQL(" FROM {}.{}").format(Identifier(SPLITGRAPH_META_SCHEMA), Identifier(object_id))
    )
    row = object_engine.run_sql(query, return_shape=ResultShape.ONE_MANY)

    # The result alternates min, max, min, max, ... so even slots are minima
    # and odd slots are maxima.
    index = {}
    for name, lowest, highest in zip(columns_to_index, row[0::2], row[1::2]):
        index[name] = (lowest, highest)

    # A multi-column key isn't covered by the per-column entries above, so its
    # range is stored too (used for faster chunking/querying). The "$" prefix
    # keeps the entry from clashing with a real column name.
    if len(pk_columns) > 1:
        pk_types = [column_types[name] for name in pk_columns]
        index["$pk"] = extract_min_max_pks(object_engine, [object_id], pk_columns, pk_types)[0]

    if changeset:
        # Widen each range so that it also covers the values this chunk
        # overwrote or deleted. Example with a table (key (PK), value), the
        # query "value = 42" and two objects:
        #
        #   key | value
        #   1   | 42
        #
        #   key | value
        #   1   | 43 (UPDATED)
        #
        # If object 2's index only spanned [43, 43], the query would skip it
        # and wrongly return (1, 42) instead of (1, 43). Deletes are analogous:
        # the index has to show that values in the range were removed by this
        # chunk, otherwise the object is never fetched.
        #
        # See test_lq_qual_filtering for these test cases.
        #
        # The PK needs no such treatment since it is always specified in
        # deletes. DELETEs carry NULLs in the non-PK columns, which are
        # skipped below.
        for _first, old_values, _third in changeset.values():
            for name, raw in old_values.items():
                # Process only columns we are indexing (others have
                # unsupported types) and only non-NULL values.
                if name in columns_to_index and raw is not None:
                    # Old row values come from the audit trigger as JSON
                    # (strings and floats/ints only), so convert them to the
                    # kind of value the index query returned before comparing.
                    converted = adapt(raw, column_types[name])
                    current_min, current_max = index[name]
                    index[name] = (_min(current_min, converted), _max(current_max, converted))

    range_index = {}
    for key, bounds in index.items():
        range_index[key] = (coerce_val_to_json(bounds[0]), coerce_val_to_json(bounds[1]))
    return range_index