Example #1
from datetime import datetime

from splitgraph.core.common import coerce_val_to_json


def test_val_to_json():
    assert coerce_val_to_json(datetime(2010, 1, 1)) == "2010-01-01 00:00:00"
    assert coerce_val_to_json([1, 2, datetime(2010, 1, 1)]) == [1, 2, "2010-01-01 00:00:00"]
    assert coerce_val_to_json({"one": 1, "two": 2, "datetime": datetime(2010, 1, 1)}) == {
        "one": 1,
        "two": 2,
        "datetime": "2010-01-01 00:00:00",
    }
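For context, a minimal sketch of an implementation consistent with these assertions (hypothetical helper name; the real splitgraph.core.common.coerce_val_to_json may handle more types):

from datetime import date, datetime
from typing import Any


def coerce_val_to_json_sketch(val: Any) -> Any:
    # Recursively walk lists/tuples and dicts, stringifying dates and datetimes
    # so the result can be passed straight to json.dumps().
    if isinstance(val, (list, tuple)):
        return [coerce_val_to_json_sketch(v) for v in val]
    if isinstance(val, dict):
        return {k: coerce_val_to_json_sketch(v) for k, v in val.items()}
    if isinstance(val, (date, datetime)):
        return str(val)  # datetime(2010, 1, 1) -> "2010-01-01 00:00:00"
    return val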
Example #2
from typing import Any, List, Tuple


def _to_str(results: List[Tuple[Any]], use_json: bool = False) -> str:
    if use_json:
        import json
        from splitgraph.core.common import coerce_val_to_json

        return json.dumps(coerce_val_to_json(results))

    from tabulate import tabulate

    return tabulate(results, tablefmt="plain")
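A quick usage illustration (hypothetical data; assumes the imports above resolve):

rows = [(1, "apple"), (2, "orange")]
print(_to_str(rows))                 # aligned plain-text table via tabulate
print(_to_str(rows, use_json=True))  # [[1, "apple"], [2, "orange"]]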
Example #3
import logging
from typing import Dict, List, Optional, Tuple

from psycopg2.sql import SQL, Identifier

# The remaining names used below (PsycopgEngine, TableSchema, Changeset, T,
# PG_INDEXABLE_TYPES, SPLITGRAPH_META_SCHEMA, ResultShape, coerce_val_to_json,
# extract_min_max_pks, adapt, _strip_type_mod, _inject_collation, _min, _max)
# are assumed to be helpers defined elsewhere in the splitgraph codebase.


def generate_range_index(
    object_engine: "PsycopgEngine",
    object_id: str,
    table_schema: "TableSchema",
    changeset: Optional[Changeset],
    columns: Optional[List[str]] = None,
) -> Dict[str, Tuple[T, T]]:
    """
    Calculate the minimum/maximum values of every column in the object (including deleted values).

    :param object_engine: Engine the object is located on
    :param object_id: ID of the object.
    :param table_schema: Schema of the table
    :param changeset: Changeset (old values will be included in the index)
    :param columns: Columns to run the index on (default all)
    :return: Dictionary of {column: [min, max]}
    """
    columns = columns if columns is not None else [c.name for c in table_schema]

    object_pk = [c.name for c in table_schema if c.is_pk]
    if not object_pk:
        object_pk = [c.name for c in table_schema if c.pg_type in PG_INDEXABLE_TYPES]
    column_types = {c.name: _strip_type_mod(c.pg_type) for c in table_schema}
    columns_to_index = [
        c.name
        for c in table_schema
        if _strip_type_mod(c.pg_type) in PG_INDEXABLE_TYPES and (c.is_pk or c.name in columns)
    ]

    logging.debug("Running range index on columns %s", columns_to_index)
    query = SQL("SELECT ") + SQL(",").join(
        SQL(
            _inject_collation("MIN({0}", column_types[c]) + "), " +
            _inject_collation("MAX({0}", column_types[c]) +
            ")").format(Identifier(c)) for c in columns_to_index)
    query += SQL(" FROM {}.{}").format(Identifier(SPLITGRAPH_META_SCHEMA),
                                       Identifier(object_id))
    result = object_engine.run_sql(query, return_shape=ResultShape.ONE_MANY)
    index = {
        col: (cmin, cmax)
        for col, cmin, cmax in zip(columns_to_index, result[0::2], result[1::2])
    }
    # Also explicitly store the ranges of composite PKs (since they won't be included
    # in the columns list) to be used for faster chunking/querying.
    if len(object_pk) > 1:
        # Add the PK to the same index dict but prefix it with a dollar sign so that
        # it explicitly doesn't clash with any other columns.
        index["$pk"] = extract_min_max_pks(
            object_engine, [object_id], object_pk,
            [column_types[c] for c in object_pk])[0]
    if changeset:
        # Expand the index ranges to include the old row values in this chunk.
        # Why is this necessary? Say we have a table of (key (PK), value) and a
        # query "value = 42". Say we have 2 objects:
        #
        #   key | value
        #   1   | 42
        #
        #   key | value
        #   1   | 43   (UPDATED)
        #
        # If we don't include the old value that object 2 overwrote in its index, we'll
        # disregard object 2 when inspecting the index for that query (in object 2,
        # "value" only spans [43, 43]) and give the wrong answer (1, 42), even though
        # we should give (1, 43). Similarly with deletes: if an object's index doesn't
        # say "some of the values in this range are deleted in this chunk", we won't
        # fetch the object at all.
        #
        # See test_lq_qual_filtering for these test cases.

        # We don't need to do this for the PK since the PK is always specified in deletes.

        # For DELETEs, we put NULLs in the non-PK columns; make sure we ignore them here.
        for _, old_row, _ in changeset.values():
            for col, val in old_row.items():
                # Ignore columns that we aren't indexing because they have unsupported types.
                # Also ignore NULL values.
                if col not in columns_to_index or val is None:
                    continue
                # The audit trigger stores the old row values as JSON so only supports strings and floats/ints.
                # Hence, we have to coerce them into the values returned by the index.
                val = adapt(val, column_types[col])
                index[col] = (_min(index[col][0], val), _max(index[col][1], val))
    range_index = {
        k: (coerce_val_to_json(v[0]), coerce_val_to_json(v[1]))
        for k, v in index.items()
    }
    return range_index
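The changeset expansion above is plain dictionary bookkeeping; here is a stripped-down, self-contained sketch of the same idea (hypothetical function name, no engine or type adaptation):

from typing import Any, Dict, List, Tuple


def expand_index_with_old_rows(
    index: Dict[str, Tuple[Any, Any]], old_rows: List[Dict[str, Any]]
) -> Dict[str, Tuple[Any, Any]]:
    # Widen each column's (min, max) range so that values overwritten or
    # deleted by this chunk are still covered by its index.
    for old_row in old_rows:
        for col, val in old_row.items():
            if col not in index or val is None:
                continue
            cmin, cmax = index[col]
            index[col] = (min(cmin, val), max(cmax, val))
    return index


# A chunk whose current rows only span value 43..43:
print(expand_index_with_old_rows({"value": (43, 43)}, [{"value": 42}]))
# {'value': (42, 43)} -- a query for value = 42 now considers this chunk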