Example #1
 def replace_agg_meta_stmt(self):
     return SQL('INSERT INTO {} VALUES (%s, %s)').format(
         Identifier(self.agg_meta_table))
Example #2
 def insert_data_query(self):
     return SQL('INSERT INTO {} VALUES (%s, %s, %s)').format(
         Identifier(self.data_table))
Example #3
 def update_topic_query(self):
     return SQL(
         'UPDATE {} SET topic_name = %s '
         'WHERE topic_id = %s').format(Identifier(self.topics_table))
Example #4
    def populateTableRoutesGeometry(self):
        print("Creating table routes geometry...")
        sql_insert = """INSERT INTO {}(route_id, trip_id, route_short_name, stops, stop_names, route_geom)
                 VALUES (%s, %s, %s, %s, %s, ST_GeomFromText('LINESTRING(%s)', 4326));
            """
        sql_insert = SQL(sql_insert).format(Identifier("routes_geometry_"+str(self.region)))

        sql_routes = """
                    SELECT route_id, route_short_name FROM {};
            """
        sql_routes = SQL(sql_routes).format(Identifier("routes_"+str(self.region)))

        sql_geometry = """
                SELECT ST_Intersects(ST_GeomFromText('LINESTRING(%s)', 4326),
                (SELECT polygon from {0} where level = (SELECT min(level) FROM {0})));
        """
        sql_geometry = SQL(sql_geometry).format(Identifier("neighborhoods_"+str(self.region)))

        try:

            cursor = self.conn.getCursor()
            cursor.execute(sql_routes)
            routes = {}

            row = cursor.fetchone()

            while row is not None:
                (route_id, route_short_name) = row
                routes[route_id] = route_short_name
                row = cursor.fetchone()

            sql_stops = """select st.stop_id, trip_id, s.stop_lat, s.stop_lon, s.stop_parent, s.stop_name
                        from {} st, {} s
                        where trip_id in
                        (select trip_id from {} where route_id = %s)
                        and st.stop_id = s.stop_id
                        ORDER BY trip_id, stop_sequence;
                        """
            sql_stops = SQL(sql_stops).format(Identifier("stop_times_"+str(self.region)),
                                              Identifier("stops_"+str(self.region)),
                                              Identifier("trips_"+str(self.region)))
            for route in routes:
                cursor.execute(sql_stops, (route, ))

                row = cursor.fetchone()
                trips_set = []
                trips_id = []
                route_stops = []
                stop_names = []

                trip_stops = {}
                trip_names = {}
                trip_geometry = {}

                previous_trip = -1

                geometry = ""
                while row is not None:
                    (stop_id, trip_id, lat, lon, parent, name) = row
                    if not parent:
                        #stop does not have a parent
                        parent = stop_id
                    if trip_id != previous_trip:
                        if previous_trip != -1:
                            route_set = set(route_stops)
                            res = self.checkNewTrip(trips_set, route_set)
                            if res >= -1:
                                trips_set.append(set(route_set))
                                trips_id.append(previous_trip)
                                trip_stops[previous_trip] = route_stops
                                trip_names[previous_trip] = stop_names
                            if res >= 0:
                                del trips_set[res]
                                del trips_id[res]

                            geometry = geometry[:-1]
                            trip_geometry[previous_trip] = geometry

                        geometry = ""
                        geometry += str(lon) + " " + str(lat) + ","
                        route_stops = [parent]
                        stop_names = [name]
                    else:
                        route_stops.append(parent)
                        stop_names.append(name)
                        geometry += str(lon) + " " + str(lat) + ","
                    previous_trip = trip_id
                    row = cursor.fetchone()

                route_set = set(route_stops)
                res = self.checkNewTrip(trips_set, route_set)
                if res >= -1:
                    trips_set.append(set(route_set))
                    trips_id.append(previous_trip)
                    trip_stops[previous_trip] = route_stops
                    trip_names[previous_trip] = stop_names
                if res >= 0:
                    del trips_set[res]
                    del trips_id[res]

                geometry = geometry[:-1]
                trip_geometry[previous_trip] = geometry

                for trip_id in trips_id:
                    geometry_trip = AsIs(trip_geometry[trip_id])
                    stops = trip_stops[trip_id]

                    cursor.execute(sql_geometry, (geometry_trip, ))
                    (intersects, ) = cursor.fetchone()

                    if intersects:
                        names = trip_names[trip_id]
                        cursor.execute(sql_insert, (route, trip_id, routes[route], stops, names, geometry_trip))
                        self.conn.commit()
            cursor.close()
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
        except:
            print("Unexpected error:", sys.exc_info()[0])
Example #5
    def search(self, category="", filters=[], types=[], keywords="", author=None, sort=[], projection=['id', 'title'], regex=False):
        """
        INPUT:

        - ``category`` -- a knowl category such as "ec"  or "mf".
        - ``filters`` -- a list, giving a subset of "beta", "reviewed", "in progress" and "deleted".
            Knowls in the returned list will have their most recent status among the provided values.
        - ``types`` -- a list, giving a subset of ["normal", "annotations"]
        - ``keywords`` -- a string giving a space separated list of lower case keywords from the id, title and content.  If regex is set, will be used instead as a regular expression to match against content, title and knowl id.
        - ``author`` -- a string or list of strings giving authors
        - ``sort`` -- a list of strings or pairs (x, dir) where x is a column name and dir is 1 or -1.
        - ``projection`` -- a list of column names, not including ``_keywords``
        - ``regex`` -- whether to use regular expressions rather than keyword search
        """
        restrictions = []
        values = []
        if category:
            restrictions.append(SQL("cat = %s"))
            values.append(category)
        if 'in progress' not in filters:
            restrictions.append(SQL("status != %s"))
            values.append(-1)
        if keywords:
            if regex:
                restrictions.append(SQL("(content ~ %s OR title ~ %s OR id ~ %s)"))
                values.extend([keywords, keywords, keywords])
            else:
                keywords = [w for w in keywords.split(" ") if len(w) >= 3]
                if keywords:
                    restrictions.append(SQL("_keywords @> %s"))
                    values.append(keywords)
        if author is not None:
            restrictions.append(SQL("authors @> %s"))
            values.append([author])
        # In order to be able to sort by arbitrary columns, we have to select everything here.
        # We therefore do the projection in Python, which is fine for the knowls table since it's tiny
        fields = ['id'] + self._default_fields
        sqlfields = SQL(", ").join(map(Identifier, fields))
        projfields = [(col, fields.index(col)) for col in projection]
        if restrictions:
            restrictions = SQL(" WHERE ") + SQL(" AND ").join(restrictions)
        else:
            restrictions = SQL("")
        selecter = SQL("SELECT DISTINCT ON (id) {0} FROM kwl_knowls{1} ORDER BY id, timestamp DESC").format(sqlfields, restrictions)
        secondary_restrictions = []
        if filters:
            secondary_restrictions.append(SQL("knowls.{0} = ANY(%s)").format(Identifier("status")))
            values.append([knowl_status_code[q] for q in filters if q in knowl_status_code])
        else:
            secondary_restrictions.append(SQL("status >= %s"))
            values.append(0)
        if not types:
            # default to just showing normal knowls
            types = ["normal"]
        if len(types) == 1:
            secondary_restrictions.append(SQL("type = %s"))
            values.append(knowl_type_code[types[0]])
        else:
            secondary_restrictions.append(SQL("type = ANY(%s)"))
            values.append([knowl_type_code[typ] for typ in types])
        secondary_restrictions = SQL(" AND ").join(secondary_restrictions)
        if sort:
            sort = SQL(" ORDER BY ") + self._sort_str(sort)
        else:
            sort = SQL("")
        selecter = SQL("SELECT {0} FROM ({1}) knowls WHERE {2}{3}").format(sqlfields, selecter, secondary_restrictions, sort)
        cur = self._execute(selecter, values)
        return [{k:res[i] for k,i in projfields} for res in cur]
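A minimal usage sketch for the search method above; the knowl database object and the query values are hypothetical:

results = knowldb.search(
    category="ec",
    filters=["reviewed"],
    keywords="torsion",
    projection=["id", "title"],
)
# -> a list of dicts keyed by the projection columns, e.g. [{"id": ..., "title": ...}, ...]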
Example #6
    def _chunk_table(
        self,
        repository: "Repository",
        source_schema: str,
        source_table: str,
        table_size: int,
        chunk_size: int,
        extra_indexes: Optional[ExtraIndexInfo] = None,
        table_schema: Optional[TableSchema] = None,
        in_fragment_order: Optional[List[str]] = None,
        overwrite: bool = False,
    ) -> List[str]:
        table_pk = [
            p[0] for p in self.object_engine.get_change_key(
                source_schema, source_table)
        ]
        table_schema = table_schema or self.object_engine.get_full_table_schema(
            source_schema, source_table)
        object_ids = []

        # We need to do multiple things here in a specific way to not tank the performance:
        #  * Chunk the table up ordering by PK (or potentially another chunk key in the future)
        #  * Run LTHash on added rows in every chunk
        #  * Copy each chunk into a CStore table (adding ranges/bloom filter index to it
        #    in the metadata).
        #
        # Chunking is very slow to do with
        #   SELECT * FROM source LIMIT chunk_size OFFSET offset ORDER BY pk
        # as Postgres needs (even if there's an index on the PK) to go through the first `offset`
        # tuples before finding out what it's supposed to copy. So for the full table, PG will do
        # 0 + chunk_size + 2 * chunk_size + ... + (no_chunks - 1) * chunk_size fetches
        # which is O(n^2).
        #
        # The second strategy here was recording the last PK we saw and then copying the next
        # fragment out of the table with SELECT ... WHERE pk > last_pk ORDER BY pk LIMIT chunk_size
        # but that still led to poor performance on large tables (8M rows, chunks of ~200k rows
        # would take 1 minute each to create).
        #
        # Third attempt was adding a temporary column to the source table using RANK () OVER
        #   (ORDER BY pk) but that required a join with a CTE and a couple of sequential
        # scans which also took more than 15 minutes on an 8M row table, no matter whether the
        # table had indexes on the join key.
        #
        # In the current setup, we compute the partition key and extract the table contents
        # into a TEMPORARY table, then create an index on that partition key, then copy data
        # out of it into CStore. The first part takes 50 seconds, the second takes 16 seconds
        # and after that extracting a chunk takes a few seconds.

        temp_table = "sg_tmp_partition_" + source_table
        chunk_id_col = "sg_tmp_partition_id"

        pk_sql = SQL(",").join(Identifier(p) for p in table_pk)
        # Example query: CREATE TEMPORARY TABLE sg_tmp_partition_table AS SELECT *,
        # (ROW_NUMBER() OVER (ORDER BY pk) - 1) / chunk_size sg_tmp_partition_id FROM source_schema.table
        logging.info("Processing table %s", source_table)
        no_chunks = int(math.ceil(table_size / chunk_size))

        log_progress = _log_commit_progress(table_size, no_chunks)
        log_func = logging.info if log_progress else logging.debug

        log_func("Computing table partitions")
        tmp_table_query = (SQL(
            "CREATE TEMPORARY TABLE {} AS SELECT *, (ROW_NUMBER() OVER (ORDER BY "
        ).format(Identifier(temp_table)) + pk_sql +
                           SQL(") - 1) / %s {} FROM {}.{}").format(
                               Identifier(chunk_id_col),
                               Identifier(source_schema),
                               Identifier(source_table)))
        self.object_engine.run_sql(tmp_table_query, (chunk_size, ))

        log_func("Indexing the partition key")
        self.object_engine.run_sql(
            SQL("CREATE INDEX {} ON {}({})").format(
                Identifier("idx_" + temp_table), Identifier(temp_table),
                Identifier(chunk_id_col)))

        log_func("Storing and indexing the table")
        pbar = tqdm(
            range(0, no_chunks),
            unit="objs",
            total=no_chunks,
            ascii=SG_CMD_ASCII,
            disable=not log_progress,
        )

        for chunk_id in pbar:
            new_fragment = self.create_base_fragment(
                "pg_temp",
                temp_table,
                repository.namespace,
                chunk_id_col=chunk_id_col,
                chunk_id=chunk_id,
                extra_indexes=extra_indexes,
                table_schema=table_schema,
                in_fragment_order=in_fragment_order,
                overwrite=overwrite,
            )
            object_ids.append(new_fragment)

        # Temporary tables get deleted at the end of tx but sometimes we might run
        # multiple sg operations in the same transaction and clash.
        self.object_engine.delete_table("pg_temp", temp_table)
        return object_ids
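A standalone sketch of the partitioning query built above, using psycopg2.sql composition; the schema, table, primary key and chunk size are illustrative placeholders:

from psycopg2.sql import SQL, Identifier

chunk_size = 100000
pk_sql = SQL(",").join(Identifier(c) for c in ["id"])
tmp_table_query = (
    SQL("CREATE TEMPORARY TABLE {} AS SELECT *, (ROW_NUMBER() OVER (ORDER BY ").format(
        Identifier("sg_tmp_partition_orders"))
    + pk_sql
    + SQL(") - 1) / %s {} FROM {}.{}").format(
        Identifier("sg_tmp_partition_id"), Identifier("public"), Identifier("orders")))
# The composed query is then executed with the chunk size as the only parameter,
# e.g. cursor.execute(tmp_table_query, (chunk_size,))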
Example #7
 def __init__(self, cr):
     self.name = str(uuid.uuid1())
     self._name = Identifier(self.name)
     self._cr = cr
     self.closed = False
     cr.execute(SQL('SAVEPOINT {}').format(self._name))
Example #8
def IdentifierWrapper(name, convert=True):
    """
    Returns a composable representing an SQL identifier.

    This is a wrapper for psycopg2.sql.Identifier that supports ARRAY slicers
    and converts them (if desired) from the Python format to SQL,
    where indexing starts at 1 and the end of a slice is inclusive.

    EXAMPLES::

        sage: IdentifierWrapper('name')
        Identifier('name')
        sage: print(IdentifierWrapper('name[:10]').as_string(db.conn))
        "name"[:10]
        sage: print(IdentifierWrapper('name[1:10]').as_string(db.conn))
        "name"[2:10]
        sage: print(IdentifierWrapper('name[1:10]', convert = False).as_string(db.conn))
        "name"[1:10]
        sage: print(IdentifierWrapper('name[1:10:3]').as_string(db.conn))
        "name"[2:10:3]
        sage: print(IdentifierWrapper('name[1:10:3][0:2]').as_string(db.conn))
        "name"[2:10:3][1:2]
        sage: print(IdentifierWrapper('name[1:10:3][0::1]').as_string(db.conn))
        "name"[2:10:3][1::1]
        sage: print(IdentifierWrapper('name[1:10:3][0]').as_string(db.conn))
        "name"[2:10:3][1]
    """
    if "[" not in name:
        return Identifier(name)
    else:
        i = name.index("[")
        knife = name[i:]
        name = name[:i]
        # convert python slicer to postgres slicer
        # SQL starts at 1, and it is inclusive at the end
        # so we just need to convert a:b:c -> a+1:b:c

        # first we remove spaces
        knife = knife.replace(" ", "")

        # assert that the knife is of the shape [*]
        if knife[0] != "[" or knife[-1] != "]":
            raise ValueError("%s is not in the proper format" % knife)
        chunks = knife[1:-1].split("][")
        # Prevent SQL injection
        if not all(
                all(x == "" or x.isdigit() for x in chunk.split(":"))
                for chunk in chunks):
            raise ValueError("%s must contain only digits, colons and brackets" %
                             knife)
        if convert:
            for i, s in enumerate(chunks):
                # each cut is of the format a:b:c
                # where a, b, c are either integers or empty strings
                split = s.split(":", 1)
                # nothing to adjust
                if split[0] == "":
                    continue
                else:
                    # we should increment it by 1
                    split[0] = str(int(split[0]) + 1)
                chunks[i] = ":".join(split)
            sql_slicer = "[" + "][".join(chunks) + "]"
        else:
            sql_slicer = knife

        return SQL("{0}{1}").format(Identifier(name), SQL(sql_slicer))
Example #9
def mount_hn(mountpoint: str,
             server,
             port,
             username,
             password,
             endpoints: Optional[List[str]] = None) -> None:
    """
    Mount a Hacker News story dataset using the Firebase API.
    \b
    :param endpoints: List of Firebase endpoints to mount, mounted into the same tables as
    the endpoint name. Supported endpoints: {top,new,best,ask,show,job}stories.
    """

    # A mount handler is a function that takes five arguments and any number
    # of optional ones. The mountpoint is always necessary and shows which foreign schema
    # to mount the dataset into. The connection parameters can be None.

    from splitgraph.core.table import create_foreign_table
    from splitgraph.engine import get_engine
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    server_id = mountpoint + "_server"

    # Define server options that are common for all tables managed by this wrapper
    options: Dict[str, Optional[str]] = {
        # Module path to our foreign data wrapper class on the engine side
        "wrapper": "hn_fdw.fdw.HNForeignDataWrapper",
    }

    # Initialize the FDW: this will create the foreign server and user mappings behind the scenes.
    init_fdw(engine,
             server_id=server_id,
             wrapper="multicorn",
             server_options=options)

    # Create the schema that we'll be putting foreign tables into.
    engine.run_sql(
        SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint)))

    endpoints = endpoints or [
        "topstories",
        "newstories",
        "beststories",
        "askstories",
        "showstories",
        "jobstories",
    ]

    for endpoint in endpoints:
        # Generate SQL required to create a foreign table in the target schema.
        # create_foreign_table handles the boilerplate around running CREATE FOREIGN TABLE
        # on the engine and passing necessary arguments. The `internal_table_name`
        # argument becomes available as `fdw_options["table"]` argument on the FDW side,
        # but we can also write a custom CREATE FOREIGN TABLE statement to pass more arguments.
        logging.info("Mounting %s...", endpoint)
        sql, args = create_foreign_table(
            schema=mountpoint,
            server=server_id,
            table_name=endpoint,
            internal_table_name=endpoint,
            schema_spec=_story_schema_spec,
        )

        engine.run_sql(sql, args)

    # This was all run in a single SQL transaction so that everything gets rolled back
    # and we don't have to clean up in case of an error. We can commit it now.
    engine.commit()
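A minimal invocation sketch for the mount handler above. Per the comments, the connection parameters can be None; the mountpoint name and endpoint selection are hypothetical:

mount_hn(
    "hackernews",
    server=None, port=None, username=None, password=None,
    endpoints=["topstories", "beststories"],
)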
Example #10
def mount_socrata(
    mountpoint: str,
    server,
    port,
    username,
    password,
    domain: str,
    tables: Optional[Dict[str, Any]] = None,
    app_token: Optional[str] = None,
    batch_size: Optional[int] = 10000,
) -> None:
    """
    Mount a Socrata dataset.

    Mounts a remote Socrata dataset and forwards queries to it
    \b

    :param domain: Socrata domain, for example, data.albanyny.gov. Required.
    :param tables: A dictionary mapping PostgreSQL table names to Socrata table IDs. For example,
        {"salaries": "xzkq-xp2w"}. If skipped, ALL tables in the Socrata endpoint will be mounted.
    :param app_token: Socrata app token. Optional.
    :param batch_size: Amount of rows to fetch from Socrata per request (limit parameter). Maximum 50000.
    """
    from splitgraph.engine import get_engine
    from sodapy import Socrata
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    logging.info("Mounting Socrata domain...")
    server_id = mountpoint + "_server"

    options: Dict[str, Optional[str]] = {
        "wrapper":
        "splitgraph.ingestion.socrata.fdw.SocrataForeignDataWrapper",
    }

    if domain:
        options["domain"] = domain
    if app_token:
        options["app_token"] = app_token
    if batch_size:
        options["batch_size"] = str(batch_size)

    init_fdw(
        engine,
        server_id=server_id,
        wrapper="multicorn",
        server_options=options,
    )

    engine.run_sql(
        SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint)))

    logging.info("Getting Socrata metadata")
    client = Socrata(domain=domain, app_token=app_token)
    sought_ids = tables.values() if tables else []

    try:
        datasets = client.datasets(ids=sought_ids, only=["dataset"])
    except Exception as e:
        if "Unknown response format: text/html" in str(e):
            # If the Socrata dataset/domain isn't found, sodapy doesn't catch it directly
            # and instead stumbles on an unexpected content-type of the 404 page it's served.
            # We catch that and reraise a more friendly message.
            raise RepositoryNotFoundError(
                "Socrata domain or dataset not found!") from e
        raise

    if not datasets:
        raise RepositoryNotFoundError("Socrata domain or dataset not found!")

    mount_statements, mount_args = generate_socrata_mount_queries(
        sought_ids, datasets, mountpoint, server_id, tables)

    engine.run_sql(SQL(";").join(mount_statements), mount_args)
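A usage sketch reusing the example values from the docstring above; the mountpoint name is hypothetical:

mount_socrata(
    "albany", None, None, None, None,
    domain="data.albanyny.gov",
    tables={"salaries": "xzkq-xp2w"},
    app_token=None,
    batch_size=10000,
)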
Example #11
 def to_sql(self):
     if self.column:
         column_sql = SQL('.{}').format(Identifier(self.column))
     else:
         column_sql = SQL('')
     return (SQL('{}{}').format(Identifier(self.alias), column_sql), ())
Example #12
def genDiagCount(logger, filePath):
    '''

    This function generates, for each race, the percentage of users who have a certain diagnosis

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger}
        filePath {str}

    Returns:
        dict -- dictionary containing the results
    '''
    try:
        resultsDict = {
            "mood": [],
            "anxiety": [],
            "adjustment": [],
            "adhd": [],
            "sud": [],
            "psyc": [],
            "pers": [],
            "childhood": [],
            "impulse": [],
            "cognitive": [],
            "eating": [],
            "smtf": [],
            "disso": [],
            "sleep": [],
            "fd": []
        }

        with open(filePath) as json_file:
            table1results = json.load(json_file)

        for category in resultsDict:
            for race in fig1_config["inputs"]["races"]:
                query = SQL('''
                SELECT count(*)
                FROM tejas.restofusers t1
                INNER JOIN tejas.race_age_t1new t2
                ON t1.siteid = t2.siteid
                AND t1.backgroundid = t2.backgroundid
                WHERE t1.{} is true
                AND t2.race = {}
                ''').format(
                    Identifier(category),
                    Literal(race)
                )
                data = [d[0] for d in pgIO.getAllData(query)]
                data = round((data[0]/table1results[race])*100, 1)
                resultsDict[category].append(data) #percentages

        json_file.close()

    except Exception as e:
        logger.error('Failed to generate count {}'.format(e))

    return resultsDict
Example #13
def drop_enumerator(name, if_exists=True):
    sql = SQL("DROP TYPE {} {}").format(
        SQL("if exists" if if_exists else ""), Identifier(name)
    )  # CASCADE drops dependent columns which is too drastic, so we better don't use it

    database.execute_sql(sql)
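A hypothetical call and the statement it composes:

drop_enumerator("priority_level")
# executes: DROP TYPE if exists "priority_level"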
Example #14
 def insert_aggregate_stmt(self, table_name):
     return SQL('INSERT INTO {} VALUES (%s, %s, %s, %s)').format(
         Identifier(table_name))
Example #15
    def create_base_fragment(
        self,
        source_schema: str,
        source_table: str,
        namespace: str,
        chunk_id_col: Optional[str] = None,
        chunk_id: Optional[int] = None,
        extra_indexes: Optional[ExtraIndexInfo] = None,
        in_fragment_order: Optional[List[str]] = None,
        overwrite: bool = False,
        table_schema: Optional[TableSchema] = None,
    ) -> str:
        if source_schema == "pg_temp" and not table_schema:
            raise ValueError("Cannot infer the schema of temporary tables, "
                             "pass in table_schema!")

        # Get schema (apart from the chunk ID column)
        # Fragments can't be reused in tables with different schemas
        # even if the contents match (e.g. '1' vs 1). Hence, include the table schema
        # in the object ID as well.
        table_schema = table_schema or [
            c for c in self.object_engine.get_full_table_schema(
                source_schema, source_table) if c.name != chunk_id_col
        ]

        schema_hash = self._calculate_schema_hash(table_schema)
        # Get content hash for this chunk.
        content_hash, rows_inserted = self.calculate_content_hash(
            source_schema,
            source_table,
            table_schema,
            chunk_id_col=chunk_id_col,
            chunk_id=chunk_id)

        # Object IDs are also used to key tables in Postgres so they can't be more than 63 characters.
        # In addition, table names can't start with a number (they can but every invocation has to
        # be quoted) so we have to drop 2 characters from the 64-character hash and append an "o".
        object_id = "o" + sha256(
            (content_hash + schema_hash).encode("ascii")).hexdigest()[:-2]

        with self.object_engine.savepoint("object_rename"):
            # Store the object adding the extra update/delete column (always True in this case
            # since we don't overwrite any rows) and filtering on the chunk ID.

            source_query = (
                SQL("SELECT ") +
                SQL(",").join(Identifier(c.name)
                              for c in table_schema) + SQL(",TRUE AS ") +
                Identifier(SG_UD_FLAG) + SQL("FROM {}.{}").format(
                    Identifier(source_schema), Identifier(source_table)))
            source_query_args = []

            if chunk_id_col:
                source_query += SQL("WHERE {} = %s").format(
                    Identifier(chunk_id_col))
                source_query_args = [chunk_id]

            if in_fragment_order:
                source_query += SQL(" ") + self._get_order_by_clause(
                    in_fragment_order, table_schema)
            try:
                self.object_engine.store_object(
                    object_id=object_id,
                    source_query=source_query,
                    schema_spec=add_ud_flag_column(table_schema),
                    source_query_args=source_query_args,
                    overwrite=overwrite,
                )
            except UniqueViolation:
                # Someone registered this object (perhaps a concurrent pull) already.
                logging.info(
                    "Object %s for table %s/%s already exists, continuing...",
                    object_id,
                    source_schema,
                    source_table,
                )
        with self.metadata_engine.savepoint("object_register"):
            try:
                self._register_object(
                    object_id,
                    namespace=namespace,
                    insertion_hash=content_hash,
                    deletion_hash="0" * 64,
                    table_schema=table_schema,
                    extra_indexes=extra_indexes,
                    rows_inserted=rows_inserted,
                    rows_deleted=0,
                )
            except UniqueViolation:
                # Someone registered this object (perhaps a concurrent pull) already.
                logging.info(
                    "Object %s for table %s/%s already exists, continuing...",
                    object_id,
                    source_schema,
                    source_table,
                )

        return object_id
Example #16
    def create_table_like(self, new_name, table, data=False, commit=True):
        """
        Copies the schema from an existing table, but none of the data, indexes or stats.

        INPUT:

        - ``new_name`` -- a string giving the desired table name.
        - ``table`` -- a string or PostgresSearchTable object giving an existing table.
        """
        if isinstance(table, string_types):
            table = self[table]
        search_columns = {
            typ: [col for col in table.search_cols if table.col_type[col] == typ]
            for typ in set(table.col_type.values())
        }
        extra_columns = {
            typ: [col for col in table.extra_cols if table.col_type[col] == typ]
            for typ in set(table.col_type.values())
        }
        # Remove empty lists
        for D in [search_columns, extra_columns]:
            for typ, cols in list(D.items()):
                if not cols:
                    D.pop(typ)
        if not extra_columns:
            extra_columns = extra_order = None
        else:
            extra_order = table.extra_cols
        label_col = table._label_col
        sort = table._sort_orig
        id_ordered = table._id_ordered
        search_order = table.search_cols
        self.create_table(
            new_name,
            search_columns,
            label_col,
            sort,
            id_ordered,
            extra_columns,
            search_order,
            extra_order,
            commit=commit,
        )
        if data:
            cols = SQL(", ").join(map(Identifier, ["id"] + table.search_cols))
            self._execute(
                SQL("INSERT INTO {0} ( {1} ) SELECT {1} FROM {2}").format(
                    Identifier(new_name), cols, Identifier(table.search_table)
                ),
                commit=commit,
            )
            if extra_columns:
                extra_cols = SQL(", ").join(map(Identifier, ["id"] + table.extra_cols))
                self._execute(
                    SQL("INSERT INTO {0} ( {1} ) SELECT {1} FROM {2}").format(
                        Identifier(new_name + "_extras"), extra_cols,
                        Identifier(table.extra_table)
                    ),
                    commit=commit,
                )
            self[new_name].stats.refresh_stats()
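A short usage sketch; the database handle and table names are hypothetical:

# Clone the schema of an existing search table and also copy its rows.
db.create_table_like("ec_curves_copy", "ec_curves", data=True)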
Example #17
    def record_table_as_base(
        self,
        repository: "Repository",
        table_name: str,
        image_hash: str,
        chunk_size: Optional[int] = 10000,
        source_schema: Optional[str] = None,
        source_table: Optional[str] = None,
        extra_indexes: Optional[ExtraIndexInfo] = None,
        in_fragment_order: Optional[List[str]] = None,
        overwrite: bool = False,
    ) -> List[str]:
        """
        Copies the full table verbatim into one or more new base fragments and registers them.

        :param repository: Repository
        :param table_name: Table name
        :param image_hash: Hash of the new image
        :param chunk_size: If specified, splits the table into multiple objects with a given number of rows
        :param source_schema: Override the schema the source table is stored in
        :param source_table: Override the name of the table the source is stored in
        :param extra_indexes: Dictionary of {index_type: column: index_specific_kwargs}.
        :param in_fragment_order: Key to sort data inside each chunk by.
        :param overwrite: Overwrite physical objects that already exist.
        """
        source_schema = source_schema or repository.to_schema()
        source_table = source_table or table_name

        table_size = self.object_engine.run_sql(
            SQL("SELECT COUNT (1) FROM {}.{}").format(
                Identifier(source_schema), Identifier(source_table)),
            return_shape=ResultShape.ONE_ONE,
        )

        table_schema = self.object_engine.get_full_table_schema(
            source_schema, source_table)
        if chunk_size and table_size:
            object_ids = self._chunk_table(
                repository,
                source_schema,
                source_table,
                table_size,
                chunk_size,
                extra_indexes,
                in_fragment_order=in_fragment_order,
                overwrite=overwrite,
            )

        elif table_size:
            object_ids = [
                self.create_base_fragment(
                    source_schema,
                    source_table,
                    repository.namespace,
                    extra_indexes=extra_indexes,
                    table_schema=table_schema,
                    in_fragment_order=in_fragment_order,
                    overwrite=overwrite,
                )
            ]
        else:
            # If table_size == 0, then we don't link it to any objects and simply store its schema
            object_ids = []
        self.register_tables(
            repository, [(image_hash, table_name, table_schema, object_ids)])
        return object_ids
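A sketch of how this method might be called; the repository object, image hash and chunk size are placeholders:

object_ids = object_manager.record_table_as_base(
    repository,
    table_name="orders",
    image_hash="0123456789abcdef" * 4,
    chunk_size=100000,
)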
Example #18
    def create_table(
        self,
        name,
        search_columns,
        label_col,
        sort=None,
        id_ordered=None,
        extra_columns=None,
        search_order=None,
        extra_order=None,
        commit=True,
    ):
        """
        Add a new search table to the database.  See also `create_table_like`.

        INPUT:

        - ``name`` -- the name of the table, which must include an underscore.  See existing names for consistency.
        - ``search_columns`` -- a dictionary whose keys are valid postgres types and whose values
            are lists of column names (or just a string if only one column has the specified type).
            An id column of type bigint will be added as a primary key (do not include it).
        - ``label_col`` -- the column holding the LMFDB label.  This will be used in the ``lookup`` method
            and in the display of results on the API.  Use None if there is no appropriate column.
        - ``sort`` -- If not None, provides a default sort order for the table, in formats accepted by
            the ``_sort_str`` method.
        - ``id_ordered`` -- boolean (default None).  If set, the table will be sorted by id when
            pushed to production, speeding up some kinds of search queries.  Defaults to True
            when sort is not None.
        - ``extra_columns`` -- a dictionary in the same format as the search_columns dictionary.
            If present, will create a second table (the name with "_extras" appended), linked by
            an id column.  Data in this table cannot be searched on, but will also not appear
            in the search table, speeding up scans.
        - ``search_order`` -- (optional) list of column names, specifying the default order of columns
        - ``extra_order`` -- (optional) list of column names, specifying the default order of columns

        COMMON TYPES:

        The postgres types most commonly used in the lmfdb are:

        - smallint -- a 2-byte signed integer.
        - integer -- a 4-byte signed integer.
        - bigint -- an 8-byte signed integer.
        - numeric -- exact, high precision integer or decimal.
        - real -- a 4-byte float.
        - double precision -- an 8-byte float.
        - text -- string (see collation note above).
        - boolean -- true or false.
        - jsonb -- data iteratively built from numerics, strings, booleans, nulls, lists and dictionaries.
        - timestamp -- 8-byte date and time with no timezone.
        """
        if name in self.tablenames:
            raise ValueError("%s already exists" % name)
        now = time.time()
        if id_ordered is None:
            id_ordered = sort is not None
        for typ, L in list(search_columns.items()):
            if isinstance(L, string_types):
                search_columns[typ] = [L]
        valid_list = sum(search_columns.values(), [])
        valid_set = set(valid_list)
        # Check that columns aren't listed twice
        if len(valid_list) != len(valid_set):
            C = Counter(valid_list)
            raise ValueError("Column %s repeated" % (C.most_common(1)[0][0]))
        # Check that label_col is valid
        if label_col is not None and label_col not in valid_set:
            raise ValueError("label_col must be a search column")
        # Check that sort is valid
        if sort is not None:
            for col in sort:
                if isinstance(col, tuple):
                    if len(col) != 2:
                        raise ValueError("Sort terms must be either strings or pairs")
                    if col[1] not in [1, -1]:
                        raise ValueError("Sort terms must be of the form (col, 1) or (col, -1)")
                    col = col[0]
                if col not in valid_set:
                    raise ValueError("Column %s does not exist" % (col))
        # Check that search order is valid
        if search_order is not None:
            for col in search_order:
                if col not in valid_set:
                    raise ValueError("Column %s does not exist" % (col))
            if len(search_order) != len(valid_set):
                raise ValueError("Must include all columns")

        def process_columns(coldict, colorder):
            allcols = {}
            hasid = False
            dictorder = []
            for typ, cols in coldict.items():
                self._check_col_datatype(typ)
                if isinstance(cols, string_types):
                    cols = [cols]
                for col in cols:
                    if col == "id":
                        hasid = True
                    # We have whitelisted the types, so it's okay to use string formatting
                    # to insert them into the SQL command.
                    # This is useful so that we can specify the collation in the type
                    allcols[col] = SQL("{0} " + typ).format(Identifier(col))
                    dictorder.append(col)
            allcols = [allcols[col] for col in (dictorder if colorder is None else colorder)]
            if not hasid:
                allcols.insert(0, SQL("id bigint"))
            return allcols

        processed_search_columns = process_columns(search_columns, search_order)
        with DelayCommit(self, commit, silence=True):
            creator = SQL("CREATE TABLE {0} ({1})").format(
                Identifier(name), SQL(", ").join(processed_search_columns)
            )
            self._execute(creator)
            self.grant_select(name)
            if extra_columns is not None:
                valid_extra_list = sum(extra_columns.values(), [])
                valid_extra_set = set(valid_extra_list)
                # Check that columns aren't listed twice
                if len(valid_extra_list) != len(valid_extra_set):
                    C = Counter(valid_extra_list)
                    raise ValueError("Column %s repeated" % (C.most_common(1)[0][0]))
                if extra_order is not None:
                    for col in extra_order:
                        if col not in valid_extra_set:
                            raise ValueError("Column %s does not exist" % (col))
                    if len(extra_order) != len(valid_extra_set):
                        raise ValueError("Must include all columns")
                processed_extra_columns = process_columns(extra_columns, extra_order)
                creator = SQL("CREATE TABLE {0} ({1})")
                creator = creator.format(
                    Identifier(name + "_extras"),
                    SQL(", ").join(processed_extra_columns),
                )
                self._execute(creator)
                self.grant_select(name + "_extras")
            creator = SQL(
                "CREATE TABLE {0} "
                "(cols jsonb, values jsonb, count bigint, "
                "extra boolean, split boolean DEFAULT FALSE)"
            )
            creator = creator.format(Identifier(name + "_counts"))
            self._execute(creator)
            self.grant_select(name + "_counts")
            self.grant_insert(name + "_counts")
            creator = SQL(
                "CREATE TABLE {0} "
                '(cols jsonb, stat text COLLATE "C", value numeric, '
                "constraint_cols jsonb, constraint_values jsonb, threshold integer)"
            )
            creator = creator.format(Identifier(name + "_stats"))
            self._execute(creator)
            self.grant_select(name + "_stats")
            self.grant_insert(name + "_stats")
            # FIXME use global constants ?
            inserter = SQL(
                "INSERT INTO meta_tables "
                "(name, sort, id_ordered, out_of_order, has_extras, label_col) "
                "VALUES (%s, %s, %s, %s, %s, %s)"
            )
            self._execute(
                inserter,
                [
                    name,
                    Json(sort),
                    id_ordered,
                    not id_ordered,
                    extra_columns is not None,
                    label_col,
                ],
            )
        self.__dict__[name] = self._search_table_class_(
            self,
            name,
            label_col,
            sort=sort,
            id_ordered=id_ordered,
            out_of_order=(not id_ordered),
            has_extras=(extra_columns is not None),
            total=0,
        )
        self.tablenames.append(name)
        self.tablenames.sort()
        self.log_db_change(
            "create_table",
            tablename=name,
            name=name,
            search_columns=search_columns,
            label_col=label_col,
            sort=sort,
            id_ordered=id_ordered,
            extra_columns=extra_columns,
            search_order=search_order,
            extra_order=extra_order,
        )
        print("Table %s created in %.3f secs" % (name, time.time() - now))
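A minimal sketch of a call to create_table, with hypothetical names and the search_columns dictionary in the format described by the docstring:

db.create_table(
    "test_curves",
    search_columns={
        "bigint": ["conductor"],
        "text": ["label"],
        "jsonb": ["ainvs"],
    },
    label_col="label",
    sort=["conductor", "label"],
)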
Example #19
 def unassign_user(self, user, group):
     self.execute(
         SQL("""ALTER GROUP {} DROP USER {}""").format(
             Identifier(group), Identifier(user)))
Example #20
    def rename_table(self, old_name, new_name, commit=True):
        """
        Rename a table.

        INPUT:

        - ``old_name`` -- the current name of the table, as a string
        - ``new_name`` -- the new name of the table, as a string
        """
        assert old_name != new_name
        assert new_name not in self.tablenames
        with DelayCommit(self, commit, silence=True):
            table = self[old_name]
            # first rename indexes and constraints
            icols = [Identifier(s) for s in ["index_name", "table_name"]]
            ccols = [Identifier(s) for s in ["constraint_name", "table_name"]]
            rename_index = SQL("ALTER INDEX IF EXISTS {0} RENAME TO {1}")
            rename_constraint = SQL("ALTER TABLE {0} RENAME CONSTRAINT {1} TO {2}")
            for meta, mname, cols in [
                ("meta_indexes", "index_name", icols),
                ("meta_indexes_hist", "index_name", icols),
                ("meta_constraints", "constraint_name", ccols),
                ("meta_constraints_hist", "constraint_name", ccols),
            ]:
                indexes = list(self._execute(
                    SQL("SELECT {0} FROM {1} WHERE table_name = %s").format(
                        Identifier(mname), Identifier(meta)
                    ),
                    [old_name],
                ))
                if indexes:
                    rename_index_in_meta = SQL("UPDATE {0} SET ({1}) = ({2}) WHERE {3} = {4}")
                    rename_index_in_meta = rename_index_in_meta.format(
                        Identifier(meta),
                        SQL(", ").join(cols),
                        SQL(", ").join(Placeholder() * len(cols)),
                        cols[0],
                        Placeholder(),
                    )
                    for old_index_name in indexes:
                        old_index_name = old_index_name[0]
                        new_index_name = old_index_name.replace(old_name, new_name)
                        self._execute(rename_index_in_meta, [new_index_name, new_name, old_index_name])
                        if meta == "meta_indexes":
                            self._execute(rename_index.format(
                                Identifier(old_index_name),
                                Identifier(new_index_name),
                            ))
                        elif meta == "meta_constraints":
                            self._execute(rename_constraint.format(
                                Identifier(old_name),
                                Identifier(old_index_name),
                                Identifier(new_index_name),
                            ))
            else:
                print("Renamed all indexes, constraints and the corresponding metadata")

            # rename meta_tables and meta_tables_hist
            rename_table_in_meta = SQL("UPDATE {0} SET name = %s WHERE name = %s")
            for meta in ["meta_tables", "meta_tables_hist"]:
                self._execute(rename_table_in_meta.format(Identifier(meta)), [new_name, old_name])
            else:
                print("Renamed all entries in meta_tables(_hist)")

            rename = SQL("ALTER TABLE {0} RENAME TO {1}")
            # rename extra table
            if table.extra_table is not None:
                old_extra = table.extra_table
                assert old_extra == old_name + "_extras"
                new_extra = new_name + "_extras"
                self._execute(rename.format(Identifier(old_extra), Identifier(new_extra)))
                print("Renamed {0} to {1}".format(old_extra, new_extra))
            for suffix in ["", "_counts", "_stats"]:
                self._execute(rename.format(Identifier(old_name + suffix), Identifier(new_name + suffix)))
                print("Renamed {0} to {1}".format(old_name + suffix, new_name + suffix))

            # rename oldN tables
            for backup_number in range(table._next_backup_number()):
                for ext in ["", "_extras", "_counts", "_stats"]:
                    old_name_old = "{0}{1}_old{2}".format(old_name, ext, backup_number)
                    new_name_old = "{0}{1}_old{2}".format(new_name, ext, backup_number)
                    if self._table_exists(old_name_old):
                        self._execute(rename.format(Identifier(old_name_old), Identifier(new_name_old)))
                        print("Renamed {0} to {1}".format(old_name_old, new_name_old))
            for ext in ["", "_extras", "_counts", "_stats"]:
                old_name_tmp = "{0}{1}_tmp".format(old_name, ext)
                new_name_tmp = "{0}{1}_tmp".format(new_name, ext)
                if self._table_exists(old_name_tmp):
                    self._execute(rename.format(Identifier(old_name_tmp), Identifier(new_name_tmp)))
                    print("Renamed {0} to {1}".format(old_name_tmp, new_name_tmp))

            # initialized table
            tabledata = self._execute(
                SQL(
                    "SELECT name, label_col, sort, count_cutoff, id_ordered, "
                    "out_of_order, has_extras, stats_valid, total, include_nones "
                    "FROM meta_tables WHERE name = %s"
                ),
                [new_name],
            ).fetchone()
            table = self._search_table_class_(self, *tabledata)
            self.__dict__[new_name] = table
            self.tablenames.append(new_name)
            self.tablenames.remove(old_name)
            self.tablenames.sort()
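Usage is a single call; table names here are hypothetical:

# Also renames the _extras/_counts/_stats companion tables and metadata entries.
db.rename_table("ec_curves_draft", "ec_curves")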
Example #21
    def populateTableCalendar(self):
        print("Creating table calendar...")
        fName = self.gtfs_dir + GTFS.CALENDAR_FILE
        sql = """INSERT INTO {}(service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday)
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                 ON CONFLICT DO NOTHING;
            """
        sql = SQL(sql).format(Identifier("calendar_"+str(self.region)))
        f = None
        calendar = {}
        try:
            if os.path.isfile(fName):
                with open(fName, 'r') as f:
                    line = f.readline()
                    line = f.readline()
                    while line:
                        (service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, _, _) = line.split(",")
                        calendar_service = {}
                        # GTFS calendar values are the strings "0"/"1"; compare explicitly,
                        # since a bare truthiness check would also treat "0" as set.
                        if monday == "1":
                            calendar_service[0] = True
                        if tuesday == "1":
                            calendar_service[1] = True
                        if wednesday == "1":
                            calendar_service[2] = True
                        if thursday == "1":
                            calendar_service[3] = True
                        if friday == "1":
                            calendar_service[4] = True
                        if saturday == "1":
                            calendar_service[5] = True
                        if sunday == "1":
                            calendar_service[6] = True
                        calendar[service_id] = calendar_service
                        line = f.readline()
            fName = self.gtfs_dir + GTFS.CALENDAR_DATE_FILE
            if os.path.isfile(fName):
                with open(fName, 'r') as f:
                    line = f.readline()
                    line = f.readline()
                    while line:
                        (service_id, date, exception_type) = line.split(",")
                        if int(exception_type) == 1:
                            day = datetime.strptime(date, '%Y%m%d').weekday()
                            if service_id in calendar:
                                calendar[service_id][day] = True
                            else:
                                calendar_service = {}
                                calendar_service[day] = True
                                calendar[service_id] = calendar_service

                        line = f.readline()
            #else:
            #    raise SystemExit("No calendar file provided!")
            cursor = self.conn.getCursor()
            for service_id in calendar:
                service = calendar[service_id]
                cursor.execute(sql, (service_id, service.get(0, False), service.get(1, False), service.get(2, False), service.get(3, False),
                                     service.get(4, False), service.get(5, False), service.get(6, False)))
            self.conn.commit()
            cursor.close()

        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
        except:
            print("Unexpected error:", sys.exc_info()[0])
        finally:
            if f is not None:
                f.close()
Example #22
 def create_schema(self, schema: str) -> None:
     """Create a schema if it doesn't exist"""
     self.run_sql(
         SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(schema)))
Example #23
    def populateTableLinks(self):
        print("Creating table links...")
        stop_parent = self.getStopsParent()

        sql = """INSERT INTO {}(link_id, stop_id, edge_id, source, target, edge_dist, source_ratio, edge_length, point_location,
                source_point_geom, point_target_geom)
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
            """
        sql = SQL(sql).format(Identifier("links_"+str(self.region)))
        records = []
        link_id = 0
        list_pedestrian_ways = list(MultimodalNetwork.PEDESTRIAN_WAYS)
        try:
            cursor = self.conn.getCursor()
            sql_select = """
            WITH closest_candidates AS (
              SELECT
                id AS edge_id, source, target,
                geom_way as geom_way, km as km
              FROM
                {0}
              WHERE clazz= ANY(%s)
              ORDER BY
                geom_way <-> (select stop_location from {1} where stop_id = %s)
              LIMIT 100
            )

            SELECT edge_id, source, target,
            ST_Distance(stop_location::geography, geom_way::geography)/1000 AS edge_dist,
            source_ratio,
            km as edge_length,
            ST_LineInterpolatePoint(geom_way, source_ratio) as point_location,
            ST_GeometryN(ST_Split(ST_Snap(geom_way, ST_ClosestPoint(geom_way, stop_location), 0.00001), ST_ClosestPoint(geom_way, stop_location)), 1) as source_point_geom,
            ST_GeometryN(ST_Split(ST_Snap(geom_way, ST_ClosestPoint(geom_way, stop_location), 0.00001), ST_ClosestPoint(geom_way, stop_location)), 2) as point_target_geom
            FROM
            (SELECT geom_way, edge_id, source, target, km, stop_location as stop_location, ST_LineLocatePoint(geom_way, stop_location) as source_ratio
                FROM {1}, closest_candidates
                WHERE stop_id = %s
                ORDER BY
                ST_Distance(geom_way, stop_location)
                LIMIT 1) as r
            ;
            """
            sql_select = SQL(sql_select).format(Identifier("roadnet_"+str(self.region)),
                                                Identifier("stops_"+str(self.region)))

            for stop in stop_parent:
                parent = stop_parent[stop]
                if parent:
                    continue
                else:
                    cursor.execute(sql_select, (list_pedestrian_ways, stop, stop))
                    (edge_id, source, target, edge_dist, source_ratio, edge_length, point_location,
                     source_point_geom, point_target_geom) = cursor.fetchone()
                    records.append((link_id, stop, edge_id, source, target, edge_dist, source_ratio, edge_length,
                                             point_location, source_point_geom, point_target_geom))
                    link_id += 1

                    if link_id%100 == 0:
                        cursor.executemany(sql, records)
                        self.conn.commit()
                        records = []

            cursor.executemany(sql, records)
            self.conn.commit()
            cursor.close()
            stop_parent = None


        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
        except:
            print("Unexpected error:", sys.exc_info()[0])
Example #24
    def copy_table(
        self,
        source_schema: str,
        source_table: str,
        target_schema: str,
        target_table: str,
        with_pk_constraints: bool = True,
    ) -> None:
        """Copy a table in the same engine, optionally applying primary key constraints as well."""

        if not self.table_exists(target_schema, target_table):
            query = SQL("CREATE TABLE {}.{} AS SELECT * FROM {}.{}").format(
                Identifier(target_schema),
                Identifier(target_table),
                Identifier(source_schema),
                Identifier(source_table),
            )
        else:
            query = SQL("INSERT INTO {}.{} SELECT * FROM {}.{}").format(
                Identifier(target_schema),
                Identifier(target_table),
                Identifier(source_schema),
                Identifier(source_table),
            )
        pks = self.get_primary_keys(source_schema, source_table)

        if with_pk_constraints and pks:
            query += (
                SQL(";ALTER TABLE {}.{} ADD PRIMARY KEY (").format(
                    Identifier(target_schema), Identifier(target_table)) +
                SQL(",").join(SQL("{}").format(Identifier(c))
                              for c, _ in pks) + SQL(")"))
        self.run_sql(query)
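
A minimal sketch of the Composed-concatenation pattern copy_table relies on, outside the class; it assumes an open psycopg2 connection conn, and the schema, table, and primary-key names are illustrative:

import psycopg2
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect("dbname=example")  # assumed connection string

query = SQL("CREATE TABLE {}.{} AS SELECT * FROM {}.{}").format(
    Identifier("target"), Identifier("orders_copy"),
    Identifier("source"), Identifier("orders"))

# Composed objects support +, so the PK clause is appended just as above
pks = ["order_id", "line_no"]  # illustrative primary-key columns
query += (SQL(";ALTER TABLE {}.{} ADD PRIMARY KEY (").format(
              Identifier("target"), Identifier("orders_copy"))
          + SQL(",").join(Identifier(c) for c in pks)
          + SQL(")"))

print(query.as_string(conn))  # shows the fully quoted statement pair
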
Example #25
0
    def query(self,
              startindex=0,
              limit=10,
              resulttype='results',
              bbox=[],
              datetime_=None,
              properties=[],
              sortby=[],
              select_properties=[],
              skip_geometry=False,
              q=None,
              **kwargs):
        """
        Query PostGIS for all the content.
        e.g.: http://localhost:5000/collections/hotosm_bdi_waterways/items?
        limit=1&resulttype=results

        :param startindex: starting record to return (default 0)
        :param limit: number of records to return (default 10)
        :param resulttype: return results or a hit count ('results' or 'hits', default 'results')
        :param bbox: bounding box [minx,miny,maxx,maxy]
        :param datetime_: temporal (datestamp or extent)
        :param properties: list of tuples (name, value)
        :param sortby: list of dicts (property, order)
        :param select_properties: list of property names
        :param skip_geometry: bool of whether to skip geometry (default False)
        :param q: full-text search term(s)

        :returns: GeoJSON FeatureCollection
        """
        LOGGER.debug('Querying PostGIS')

        if resulttype == 'hits':

            with DatabaseConnection(self.conn_dic, self.table,
                                    context="hits") as db:
                cursor = db.conn.cursor(cursor_factory=RealDictCursor)

                where_clause = self.__get_where_clauses(properties=properties,
                                                        bbox=bbox)
                sql_query = SQL("SELECT COUNT(*) as hits from {} {}").\
                    format(Identifier(self.table), where_clause)
                try:
                    cursor.execute(sql_query)
                except Exception as err:
                    LOGGER.error('Error executing sql_query: {}: {}'.format(
                        sql_query.as_string(cursor), err))
                    raise ProviderQueryError()

                hits = cursor.fetchone()["hits"]

            return self.__response_feature_hits(hits)

        end_index = startindex + limit

        with DatabaseConnection(self.conn_dic, self.table) as db:
            cursor = db.conn.cursor(cursor_factory=RealDictCursor)

            props = db.columns if select_properties == [] else \
                SQL(', ').join([Identifier(p) for p in select_properties])

            geom = SQL('') if skip_geometry else \
                SQL(",ST_AsGeoJSON({})").format(Identifier(self.geom))

            where_clause = self.__get_where_clauses(properties=properties,
                                                    bbox=bbox)

            orderby = self._make_orderby(sortby) if sortby else SQL('')

            sql_query = SQL("DECLARE \"geo_cursor\" CURSOR FOR \
             SELECT DISTINCT {} {} FROM {} {} {}"                                                 ).\
                format(props,
                       geom,
                       Identifier(self.table),
                       where_clause,
                       orderby)

            LOGGER.debug('SQL Query: {}'.format(sql_query.as_string(cursor)))
            LOGGER.debug('Start Index: {}'.format(startindex))
            LOGGER.debug('End Index: {}'.format(end_index))
            try:
                cursor.execute(sql_query)
                # the first FETCH skips `startindex` rows; the second fetches the
                # `limit` rows that fetchall() below returns
                for index in [startindex, limit]:
                    cursor.execute(
                        "fetch forward {} from geo_cursor".format(index))
            except Exception as err:
                LOGGER.error('Error executing sql_query: {}'.format(
                    sql_query.as_string(cursor)))
                LOGGER.error(err)
                raise ProviderQueryError()

            row_data = cursor.fetchall()

            feature_collection = {'type': 'FeatureCollection', 'features': []}

            for rd in row_data:
                feature_collection['features'].append(
                    self.__response_feature(rd))

            return feature_collection
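
A minimal sketch of the DECLARE ... CURSOR / FETCH FORWARD paging pattern used above, stripped of the provider plumbing; the connection string and table name (taken from the docstring URL) are illustrative:

import psycopg2
from psycopg2.extras import RealDictCursor
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect("dbname=geo")  # assumed connection string
startindex, limit = 20, 10

with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
    cur.execute(SQL("DECLARE pager CURSOR FOR SELECT * FROM {}").format(
        Identifier("hotosm_bdi_waterways")))
    cur.execute("FETCH FORWARD %s FROM pager", (startindex,))  # skip the offset
    cur.execute("FETCH FORWARD %s FROM pager", (limit,))       # read one page
    rows = cur.fetchall()  # rows of the second FETCH only
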
Example #26
0
 def delete_schema(self, schema: str) -> None:
     """Delete a schema if it exists, including all the tables in it."""
     self.run_sql(
         SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(Identifier(schema)),
         return_shape=ResultShape.NONE,
     )
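
A standalone equivalent, assuming an open psycopg2 connection conn (run_sql and ResultShape belong to the surrounding project and are not reproduced; the schema name is illustrative):

import psycopg2
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect("dbname=example")  # assumed connection string

with conn.cursor() as cur:
    cur.execute(SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(Identifier("scratch")))
conn.commit()
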
Example #27
0
 def insert_topic_query(self):
     return SQL(
         'INSERT INTO {0} (topic_name) VALUES (%(topic)s); '
         'SELECT MAX(topic_id) FROM {0} '
         'WHERE topic_name = %(topic)s').format(
         Identifier(self.topics_table))
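
A sketch of executing the composed statement with its named parameter, assuming an open psycopg2 connection conn and an illustrative topics table; because the string holds two statements, the cursor exposes the result of the trailing SELECT:

import psycopg2
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect("dbname=historian")  # assumed connection string

stmt = SQL(
    'INSERT INTO {0} (topic_name) VALUES (%(topic)s); '
    'SELECT MAX(topic_id) FROM {0} '
    'WHERE topic_name = %(topic)s').format(Identifier("topics"))

with conn.cursor() as cur:
    cur.execute(stmt, {"topic": "building/temperature"})
    topic_id = cur.fetchone()[0]  # id of the row just inserted
conn.commit()
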
Example #28
0
    def _store_changesets(
        self,
        table: "Table",
        changesets: Any,
        schema: str,
        extra_indexes: Optional[ExtraIndexInfo] = None,
        in_fragment_order: Optional[List[str]] = None,
        overwrite: bool = False,
        table_name: Optional[str] = None,
    ) -> List[str]:
        """
        Store and register multiple changesets as fragments.

        :param table: Table object the changesets belong to
        :param changesets: List of changeset dictionaries. Empty changesets will be ignored.
        :param schema: Schema the table is checked out into.
        :param extra_indexes: Dictionary of {index_type: column: index_specific_kwargs}.
        :param in_fragment_order: Optional list of column names to order rows by when storing each fragment.
        :param overwrite: Overwrite object if already exists.
        :param table_name: Name to store the fragments under (defaults to the table's own name).
        :return: List of created object IDs.
        """
        object_ids = []
        logging.info("Storing and indexing table %s", table.table_name)
        table_name = table_name or table.table_name
        for sub_changeset in tqdm(changesets,
                                  unit="objs",
                                  ascii=SG_CMD_ASCII,
                                  disable=len(changesets) < 3):
            if not sub_changeset:
                continue
            # Store the fragment in a temporary location and then find out its hash and rename to the actual target.
            # Optimisation: in the future, we can hash the upserted rows that we need preemptively and possibly
            # avoid storing the object altogether if it's a duplicate.
            tmp_object_id = self._store_changeset(sub_changeset, table_name,
                                                  schema, table.table_schema)

            (
                deletion_hash,
                insertion_hash,
                object_id,
                rows_inserted,
                rows_deleted,
            ) = self._get_patch_fragment_hashes_stats(sub_changeset, table,
                                                      tmp_object_id)

            object_ids.append(object_id)

            # Wrap this rename in a SAVEPOINT so that if the table already exists,
            # the error doesn't roll back the whole transaction (us creating and registering all other objects).
            with self.object_engine.savepoint("object_rename"):
                source_query = SQL("SELECT * FROM {}.{}").format(
                    Identifier("pg_temp"), Identifier(tmp_object_id))

                if in_fragment_order:
                    source_query += SQL(" ") + self._get_order_by_clause(
                        in_fragment_order, table.table_schema)

                try:
                    self.object_engine.store_object(
                        object_id=object_id,
                        source_query=source_query,
                        schema_spec=add_ud_flag_column(table.table_schema),
                        overwrite=overwrite,
                    )
                except UniqueViolation:
                    # Someone registered this object (perhaps a concurrent pull) already.
                    logging.info(
                        "Object %s for table %s/%s already exists, continuing...",
                        object_id,
                        table.repository,
                        table.table_name,
                    )
                self.object_engine.delete_table("pg_temp", tmp_object_id)
                # There are some cases where an object can already exist in the object engine (in the cache)
                # but has been deleted from the metadata engine, so when it's recreated, we'll skip
                # actually registering it. Hence, we still want to proceed trying to register
                # it no matter what.

            # Same here: if we are being called as part of a commit and an object
            # already exists, we'll roll back everything that the caller has done
            # (e.g. registering the new image) if we don't have a savepoint.
            with self.metadata_engine.savepoint("object_register"):
                try:
                    self._register_object(
                        object_id,
                        namespace=table.repository.namespace,
                        insertion_hash=insertion_hash.hex(),
                        deletion_hash=deletion_hash.hex(),
                        table_schema=table.table_schema,
                        changeset=sub_changeset,
                        extra_indexes=extra_indexes,
                        rows_inserted=rows_inserted,
                        rows_deleted=rows_deleted,
                    )
                except UniqueViolation:
                    logging.info(
                        "Object %s for table %s/%s already exists, continuing...",
                        object_id,
                        table.repository,
                        table.table_name,
                    )

        return object_ids
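
A minimal sketch of the SAVEPOINT-around-a-possible-conflict pattern this method relies on, written against plain psycopg2 rather than the engine wrappers; the objects table and its unique constraint are illustrative:

import psycopg2
from psycopg2.errors import UniqueViolation
from psycopg2.sql import SQL, Identifier

def register_object(conn, object_id):
    """Insert inside a SAVEPOINT so a duplicate does not abort the whole transaction."""
    with conn.cursor() as cur:
        cur.execute("SAVEPOINT object_register")
        try:
            cur.execute(
                SQL("INSERT INTO {} (object_id) VALUES (%s)").format(
                    Identifier("objects")),
                (object_id,))
            cur.execute("RELEASE SAVEPOINT object_register")
        except UniqueViolation:
            # someone registered it already; undo just this insert and carry on
            cur.execute("ROLLBACK TO SAVEPOINT object_register")
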
Example #29
0
 def update_agg_topic_stmt(self):
     return SQL(
         'UPDATE {} SET agg_topic_name = %s '
         'WHERE agg_topic_id = %s').format(
         Identifier(self.agg_topics_table))
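
A sketch of running the composed UPDATE, assuming an open psycopg2 connection conn and an illustrative aggregate-topics table:

import psycopg2
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect("dbname=historian")  # assumed connection string

stmt = SQL('UPDATE {} SET agg_topic_name = %s '
           'WHERE agg_topic_id = %s').format(Identifier("aggregate_topics"))

with conn.cursor() as cur:
    cur.execute(stmt, ("avg_outdoor_temp", 42))
conn.commit()
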
Example #30
0
def _qual_to_sql_clause(qual: Tuple[str, str, str],
                        ctype: str) -> Tuple[Composed, Tuple[str]]:
    """Convert a qual to a normal SQL clause that can be run against the actual object rather than the index."""
    column_name, qual_op, value = qual
    return SQL("{}::" + ctype + " " + qual_op + " %s").format(
        Identifier(column_name)), (value, )
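
A sketch of composing the returned clause into a full query, reusing the function above; it assumes an open psycopg2 connection conn, and the object-table name and qual are illustrative:

import psycopg2
from psycopg2.sql import SQL, Identifier

conn = psycopg2.connect("dbname=example")  # assumed connection string

clause, args = _qual_to_sql_clause(("price", ">", "100"), "numeric")
query = SQL("SELECT * FROM {} WHERE ").format(Identifier("o_abc123")) + clause

with conn.cursor() as cur:
    cur.execute(query, args)  # renders roughly as: "price"::numeric > '100'
    rows = cur.fetchall()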