Beispiel #1
0
    def materialize(
        self,
        destination: str,
        destination_schema: Optional[str] = None,
        lq_server: Optional[str] = None,
    ) -> None:
        """
        Materialize a Splitgraph table in the target schema as a normal Postgres table,
        potentially downloading all required objects and using them to reconstruct it.

        :param destination: Name of the destination table.
        :param destination_schema: Name of the destination schema.
        :param lq_server: If set, sets up a layered querying FDW for the table instead
            using this foreign server.
        """
        target_schema = destination_schema or self.repository.to_schema()
        engine = self.repository.object_engine
        object_manager = self.repository.objects
        engine.delete_table(target_schema, destination)

        if lq_server:
            # Layered querying: point a foreign table at the LQ server instead
            # of physically materializing the data.
            query, args = create_foreign_table(
                target_schema, lq_server, self.table_name, self.table_schema
            )
            engine.run_sql(query, args)
            return

        # Materialize by applying fragments to one another in their dependency order.
        with object_manager.ensure_objects(
            table=self, objects=self.objects
        ) as required_objects:
            engine.create_table(
                schema=target_schema,
                table=destination,
                schema_spec=self.table_schema,
                include_comments=True,
                unlogged=True,
            )
            if not required_objects:
                return

            logging.debug(
                "Applying %s...", pluralise("fragment", len(required_objects))
            )

            # Decide how often to report progress: scale the reporting interval
            # so that roughly one message is emitted per _PROGRESS_EVERY rows.
            table_size = self.get_size()
            progress_every: Optional[int] = None
            if table_size > _PROGRESS_EVERY:
                progress_every = int(
                    ceil(len(required_objects) * _PROGRESS_EVERY / float(table_size))
                )

            engine.apply_fragments(
                [
                    (SPLITGRAPH_META_SCHEMA, obj)
                    for obj in cast(List[str], required_objects)
                ],
                target_schema,
                destination,
                progress_every=progress_every,
            )
Beispiel #2
0
def generate_socrata_mount_queries(sought_ids, datasets, mountpoint, server_id, tables):
    """
    Build the SQL statements (and their flattened argument list) that mount a set of
    Socrata datasets as foreign tables in the `mountpoint` schema.

    :param sought_ids: Socrata dataset IDs that were explicitly requested.
    :param datasets: Socrata metadata dictionaries for the datasets found on the server.
    :param mountpoint: Name of the schema to mount the foreign tables into.
    :param server_id: Name of the foreign server to use.
    :param tables: Optional mapping of desired table name -> Socrata dataset ID.
    :return: Tuple of (list of SQL statements, list of query arguments).
    :raises ValueError: If some explicitly requested datasets weren't found.
    """
    # Local imports since this module gets run from commandline entrypoint on startup.

    from splitgraph.core.output import slugify
    from splitgraph.core.output import truncate_list
    from splitgraph.core.output import pluralise
    from splitgraph.core.table import create_foreign_table
    from splitgraph.ingestion.socrata.querying import socrata_to_sg_schema

    found_ids = set(d["resource"]["id"] for d in datasets)
    logging.info("Loaded metadata for %s", pluralise("Socrata table", len(found_ids)))

    if tables:
        # Bug fix: report the *requested* IDs that are absent from the server's
        # response. The previous version computed found-but-not-requested IDs,
        # which never matches the "couldn't be found" error being raised.
        missing_ids = [s for s in sought_ids if s not in found_ids]
        if missing_ids:
            raise ValueError(
                "Some Socrata tables couldn't be found! Missing tables: %s"
                % truncate_list(missing_ids)
            )

        # Invert the mapping so we can look up the desired table name by dataset ID.
        tables_inv = {s: p for p, s in tables.items()}
    else:
        tables_inv = {}

    mount_statements = []
    mount_args = []
    for dataset in datasets:
        socrata_id = dataset["resource"]["id"]
        # Use the caller-provided name if given; otherwise derive one from the
        # dataset's human-readable name plus its (slug-safe) Socrata ID.
        table_name = tables_inv.get(socrata_id) or slugify(
            dataset["resource"]["name"]
        ) + "_" + socrata_id.replace("-", "_")
        schema_spec, column_map = socrata_to_sg_schema(dataset)
        sql, args = create_foreign_table(
            schema=mountpoint,
            server=server_id,
            table_name=table_name,
            schema_spec=schema_spec,
            internal_table_name=socrata_id,
            extra_options={"column_map": json.dumps(column_map)},
        )

        description = dataset["resource"].get("description")
        if description:
            # Surface the dataset description as a Postgres table comment.
            sql += SQL("COMMENT ON FOREIGN TABLE {}.{} IS %s").format(
                Identifier(mountpoint), Identifier(table_name)
            )
            args.append(description)

        mount_statements.append(sql)
        mount_args.extend(args)

    return mount_statements, mount_args
Beispiel #3
0
def generate_socrata_mount_queries(sought_ids, datasets, mountpoint, server_id,
                                   tables: TableInfo):
    """
    Build the SQL statements and the flattened argument list that mount a set of
    Socrata datasets as foreign tables in the `mountpoint` schema.
    """
    # Local imports since this module gets run from commandline entrypoint on startup.

    from splitgraph.core.output import slugify
    from splitgraph.core.output import pluralise
    from splitgraph.ingestion.socrata.querying import socrata_to_sg_schema

    found_ids = set(d["resource"]["id"] for d in datasets)
    logging.info("Loaded metadata for %s",
                 pluralise("Socrata table", len(found_ids)))

    # Map dataset ID -> desired table name (validates requested IDs as well).
    tables_inv = _get_table_map(found_ids, sought_ids, tables)

    statements = []
    arguments = []
    for dataset in datasets:
        resource = dataset["resource"]
        socrata_id = resource["id"]

        # Prefer the caller-provided name; fall back to a slug of the dataset's
        # human-readable name suffixed with its (underscore-safe) Socrata ID.
        mapped_name = tables_inv.get(socrata_id)
        if not mapped_name:
            mapped_name = (slugify(resource["name"]) + "_"
                           + socrata_id.replace("-", "_"))

        schema_spec, column_map = socrata_to_sg_schema(dataset)
        sql, args = create_foreign_table(
            schema=mountpoint,
            server=server_id,
            table_name=mapped_name,
            schema_spec=schema_spec,
            extra_options={
                "column_map": json.dumps(column_map),
                "table": socrata_id
            },
        )

        description = resource.get("description")
        if description:
            # Surface the dataset description as a Postgres table comment.
            sql += SQL("COMMENT ON FOREIGN TABLE {}.{} IS %s").format(
                Identifier(mountpoint), Identifier(mapped_name))
            args.append(description)

        statements.append(sql)
        arguments.extend(args)

    return statements, arguments
Beispiel #4
0
def reindex_c(image_spec, table_name, index_options, ignore_patch_objects):
    """
    Run extra indexes on a table. This will merge the indexing results for all objects
    that a table is formed from with the current object indexes. For explanation of
    what indexes do, see the documentation for `sgr commit`.

    If the objects haven't been downloaded yet, this will download them.

    Currently reindexing objects that change other objects is unsupported and will raise
    an error. Pass `-o` to ignore these objects and only reindex supported objects.

    Image spec must be of the format ``[NAMESPACE/]REPOSITORY[:HASH_OR_TAG]``. If no tag is specified, ``HEAD`` is used.
    """
    # Local import since this module gets run from the commandline entrypoint.
    from splitgraph.core.output import pluralise

    repository, image = image_spec
    target_table = image.get_table(table_name)
    click.echo(
        "Reindexing table %s:%s/%s"
        % (repository.to_schema(), image.image_hash, table_name)
    )
    reindexed_objects = target_table.reindex(
        extra_indexes=index_options,
        raise_on_patch_objects=not ignore_patch_objects,
    )
    click.echo("Reindexed %s" % pluralise("object", len(reindexed_objects)))