Example 1
    def __init__(self,
                 object_engine: "PostgresEngine",
                 metadata_engine: Optional["PostgresEngine"] = None) -> None:
        """
        :param object_engine: An ObjectEngine that will be used as a backing store for the
            objects.
        :param metadata_engine: An SQLEngine that will be used to store/query metadata for Splitgraph
            images and objects. By default, `object_engine` is used.
        """
        super().__init__(object_engine, metadata_engine)

        # Cache size in bytes
        self.cache_size = int(get_singleton(
            CONFIG, "SG_OBJECT_CACHE_SIZE")) * 1024 * 1024

        # 0 to infinity; higher means objects with smaller sizes are more likely to
        # get evicted than objects that haven't been used for a while.
        # Currently calculated so that an object that hasn't been accessed for 5 minutes has the same
        # removal priority as an object twice its size that's just been accessed.
        self.eviction_decay_constant = float(
            get_singleton(CONFIG, "SG_EVICTION_DECAY"))

        # Objects smaller than this size are assumed to have this size (to simulate the latency of
        # downloading them).
        self.eviction_floor = float(get_singleton(
            CONFIG, "SG_EVICTION_FLOOR")) * 1024 * 1024

        # Fraction of the cache size to free when eviction is run (the greater value of this amount and the
        # amount needed to download required objects is actually freed). Eviction is an expensive operation
        # (it pauses concurrent downloads) so increasing this makes eviction happen less often at the cost
        # of more possible cache misses.
        self.eviction_min_fraction = float(
            get_singleton(CONFIG, "SG_EVICTION_MIN_FRACTION"))
Example 2
    def get_latest_version(self) -> Optional[str]:
        # Do a version check to see if updates are available. If the user is logged
        # into the registry, also send the user ID for metrics.
        # The user can opt out by setting "SG_UPDATE_FREQUENCY" to 0 or opt out of
        # sending user ID by setting SG_UPDATE_ANONYMOUS to true.

        config = create_config_dict()
        frequency = int(get_singleton(config, "SG_UPDATE_FREQUENCY"))

        if frequency == 0:
            return None

        last_check = int(get_singleton(config, "SG_UPDATE_LAST"))
        now = int(time.time())

        if last_check + frequency > now:
            return None

        headers = get_headers()
        if get_singleton(config, "SG_UPDATE_ANONYMOUS").lower() == "false":
            try:
                headers.update(
                    {"Authorization": "Bearer " + self.access_token})
            except AuthAPIError:
                pass

        try:
            logging.debug("Running update check")
            response = requests.post(
                self.endpoint + "/update_check",
                verify=self.verify,
                headers=headers,
            )
            response.raise_for_status()
            latest_version = str(response.json()["latest_version"])
        except requests.RequestException as e:
            logging.debug("Error running the update check", exc_info=e)
            return None
        except KeyError:
            logging.debug("Malformed response from the update service")
            return None

        try:
            patch_and_save_config(config, {"SG_UPDATE_LAST": str(now)})
        except Exception as e:
            logging.debug("Error patching the config", exc_info=e)
            return latest_version

        return latest_version
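
The early returns above implement a simple time gate: checks are disabled entirely when SG_UPDATE_FREQUENCY is 0, and otherwise skipped until at least that many seconds have passed since SG_UPDATE_LAST. A small sketch of that gating logic with a hypothetical helper (update_check_due is not part of the source; the one-day frequency is illustrative):

import time

def update_check_due(last_check: int, frequency: int, now: int) -> bool:
    # Hypothetical helper mirroring the gate in get_latest_version().
    if frequency == 0:
        return False  # user opted out of update checks
    return last_check + frequency <= now

now = int(time.time())
assert update_check_due(last_check=now - 2 * 86400, frequency=86400, now=now)
assert not update_check_due(last_check=now, frequency=86400, now=now)
assert not update_check_due(last_check=0, frequency=0, now=now)
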
Example 3
    def query_schema(self,
                     wrapper: Optional[str] = FDW_CLASS,
                     commit: bool = True) -> Iterator[str]:
        """
        Creates a temporary schema with tables in this image mounted as foreign tables that can be accessed via
        read-only layered querying. On exit from the context manager, the schema is discarded.

        :return: The name of the schema the image is located in.
        """
        tmp_schema = str.format("o{:032x}", getrandbits(128))
        try:
            self.object_engine.create_schema(tmp_schema)
            self._lq_checkout(target_schema=tmp_schema, wrapper=wrapper)
            if commit:
                # Make sure the new tables are seen by other connections
                self.object_engine.commit()

            # Inject extra query planner hints as session variables if specified.
            lq_tuning = get_singleton(CONFIG, "SG_LQ_TUNING")
            if lq_tuning:
                self.object_engine.run_sql(lq_tuning)
            yield tmp_schema
        finally:
            self.object_engine.run_sql(
                SQL("DROP SCHEMA IF EXISTS {} CASCADE; DROP SERVER IF EXISTS {} CASCADE;"
                    ).format(Identifier(tmp_schema),
                             Identifier(tmp_schema + "_lq_checkout_server")))
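
Because the method yields the schema name and cleans up in a finally block, it is meant to be used as a context manager (presumably wrapped with contextlib.contextmanager, which is not shown in this snippet). A hedged usage sketch, where `image` and the table name are assumptions:

# Usage sketch; `image` is assumed to be an object exposing query_schema().
with image.query_schema() as schema:
    # Tables in the image are mounted as foreign tables in `schema` and can
    # be queried read-only via layered querying; "some_table" is made up.
    rows = image.object_engine.run_sql(
        SQL("SELECT COUNT(1) FROM {}.{}").format(
            Identifier(schema), Identifier("some_table")))
# On exit, the temporary schema and its _lq_checkout_server are dropped.
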
Example 4
    def access_token(self) -> str:
        """
        Will return an up-to-date access token by either getting it from
        the configuration file or contacting the auth service for a new one.
        Will write the new access token into the configuration file.

        :return: Access token.
        """

        config = create_config_dict()

        try:
            current_access_token = get_from_subsection(
                config, "remotes", self.remote, "SG_CLOUD_ACCESS_TOKEN")
            exp = get_token_claim(current_access_token, "exp")
            now = time.time()
            if now < exp - self.access_token_expiry_tolerance:
                return current_access_token
        except KeyError:
            pass

        # Token expired or non-existent, get a new one.
        try:
            api_key = get_from_subsection(config, "remotes", self.remote,
                                          "SG_ENGINE_USER")
            api_secret = get_from_subsection(config, "remotes", self.remote,
                                             "SG_ENGINE_PWD")
            new_access_token = cast(
                str, self.get_access_token_from_api(api_key, api_secret))
        except KeyError as e:
            try:
                refresh_token = get_from_subsection(config, "remotes",
                                                    self.remote,
                                                    "SG_CLOUD_REFRESH_TOKEN")
                new_access_token = cast(str,
                                        self.get_access_token(refresh_token))
            except KeyError:
                raise AuthAPIError((
                    "No refresh token or API keys found in the config for remote %s! "
                    % self.remote
                ) + "Log into the registry using sgr cloud login.") from e

        set_in_subsection(config, "remotes", self.remote,
                          "SG_CLOUD_ACCESS_TOKEN", new_access_token)
        overwrite_config(config, get_singleton(config, "SG_CONFIG_FILE"))
        return new_access_token
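
In short, the property returns the cached token from the config while it is still within the expiry tolerance, and otherwise obtains a fresh one via API keys or a refresh token and persists it. A usage sketch; the AuthAPIClient name and its constructor arguments are assumptions, not taken from this snippet:

# Usage sketch: class name and constructor signature are assumed.
client = AuthAPIClient(remote="data.splitgraph.com")
try:
    headers = {"Authorization": "Bearer " + client.access_token}
except AuthAPIError:
    # Raised when neither API keys nor a refresh token exist for the remote;
    # the user is expected to run `sgr cloud login` first.
    headers = {}
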
Example 5
def serialize_config(config: ConfigDict,
                     config_format: bool,
                     no_shielding: bool,
                     include_defaults: bool = True) -> str:
    """
    Pretty-print the configuration or print it in the Splitgraph config file format.

    :param config: Configuration dictionary.
    :param config_format: Output configuration in the Splitgraph config file format.
    :param no_shielding: Don't replace sensitive values (like passwords) with asterisks
    :param include_defaults: Emit the config variable even if it's the same as the default.
    :return: Textual representation of the config.
    """

    result = "[defaults]\n" if config_format else ""

    # Emit normal config parameters
    for key in KEYS:
        if config_format and key in _SITUATIONAL_PARAMS:
            continue
        value = get_singleton(config, key)
        if include_defaults or key not in DEFAULTS or value != DEFAULTS[key]:
            result += _kv_to_str(key, value, no_shielding) + "\n"

    # Emit hoisted remotes
    result += "\nCurrent registered remote engines:\n" if not config_format else ""
    for remote, remote_config in get_all_in_section(config, "remotes").items():
        assert isinstance(remote_config, dict)
        if config_format:
            result += "\n" + serialize_engine_config(remote, remote_config,
                                                     no_shielding) + "\n"
        else:
            result += "\n%s:\n" % remote
            for key, value in remote_config.items():
                result += _kv_to_str(key, value, no_shielding) + "\n"

    # Print Splitfile commands
    if "commands" in config:
        result += "\nSplitfile command plugins:\n" if not config_format else "[commands]\n"
        for command_name, command_class in get_all_in_section(
                config, "commands").items():
            result += _kv_to_str(command_name, cast(str, command_class),
                                 no_shielding) + "\n"

    # Print data sources
    if "data_sources" in config:
        result += "\nData sources:\n" if not config_format else "[data_sources]\n"
        for handler_name, handler_func in get_all_in_section(
                config, "data_sources").items():
            result += _kv_to_str(handler_name, cast(str, handler_func),
                                 no_shielding) + "\n"

    # Print external object handlers
    if "external_handlers" in config:
        result += "\nExternal object handlers:\n" if not config_format else "[external_handlers]\n"
        for handler_name, handler_func in get_all_in_section(
                config, "external_handlers").items():
            result += _kv_to_str(handler_name, cast(str, handler_func),
                                 no_shielding) + "\n"

    return result
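
A quick usage sketch, assuming CONFIG is the already-parsed ConfigDict used elsewhere in the codebase:

# Human-readable dump with sensitive values shielded:
print(serialize_config(CONFIG, config_format=False, no_shielding=False))

# Splitgraph config-file format, emitting only values that differ from the defaults:
print(serialize_config(
    CONFIG, config_format=True, no_shielding=True, include_defaults=False))
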
Example 6
@click.command(name="commit")
@click.argument("repository", type=RepositoryType(exists=True))
@click.option(
    "-s",
    "--snap",
    default=False,
    is_flag=True,
    help=
    "Do not delta compress the changes and instead store the whole table again. "
    "This consumes more space, but makes checkouts faster.",
)
@click.option(
    "-c",
    "--chunk-size",
    default=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")),
    type=int,
    help=
    "Split new tables into chunks of this many rows (by primary key). The default "
    "value is governed by the SG_COMMIT_CHUNK_SIZE configuration parameter.",
)
@click.option(
    "-k",
    "--chunk-sort-keys",
    default=None,
    type=JsonType(),
    help="Sort the data inside each chunk by this/these key(s)",
)
@click.option(
    "-t",
    "--split-changesets",
Example 7
    def upload_objects(
            self, objects: List[str],
            remote_engine: "PsycopgEngine") -> List[Tuple[str, str]]:
        """
        Upload objects to Minio

        :param remote_engine: Remote Engine class
        :param objects: List of object IDs to upload
        :return: List of tuples with successfully uploaded objects and their URLs.
        """
        worker_threads = self.params.get(
            "threads",
            int(get_singleton(CONFIG, "SG_ENGINE_POOL")) - 1)

        # Determine upload URLs
        logging.info("Getting upload URLs from the registry...")
        urls = get_object_upload_urls(remote_engine, objects)

        local_engine = get_engine()

        def _do_upload(object_url):
            object_id, url = object_url
            # We get 3 URLs here (one for each of object itself, footer and schema -- emit
            # just the first one for logging)
            logging.debug("%s -> %s", object_id, url[0])
            try:
                local_engine.run_api_call("upload_object", object_id, url)
                return object_id
            except Exception:
                logging.exception("Error uploading object %s", object_id)
                return None

        successful: List[str] = []
        try:
            local_engine.autocommit = True
            with ThreadPoolExecutor(max_workers=worker_threads) as tpe:
                pbar = tqdm(
                    tpe.map(_do_upload, zip(objects, urls)),
                    total=len(objects),
                    unit="objs",
                    ascii=SG_CMD_ASCII,
                )
                for object_id in pbar:
                    if object_id:
                        successful.append(object_id)
                        pbar.set_postfix(object=object_id[:10] + "...")
            if len(successful) < len(objects):
                raise IncompleteObjectUploadError(
                    reason=None,
                    successful_objects=successful,
                    successful_object_urls=successful,
                )
            # The "URL" in this case is the same object ID: we ask the registry
            # for the actual URL by giving it the object ID.
            return [(s, s) for s in successful]
        except KeyboardInterrupt as e:
            raise IncompleteObjectUploadError(
                reason=e,
                successful_objects=successful,
                successful_object_urls=successful,
            )
        finally:
            local_engine.autocommit = False
            local_engine.close_others()
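
The worker count defaults to one less than the engine connection pool, leaving a connection for the main thread's metadata queries; an explicit "threads" entry in the handler parameters overrides it. An illustrative sketch of that default (the pool size is made up):

# Illustrative only: with a pool of 8 connections and no "threads" override,
# 7 uploader threads run while 1 connection stays with the main thread.
params = {}
sg_engine_pool = 8
worker_threads = params.get("threads", sg_engine_pool - 1)
assert worker_threads == 7
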
Example 8
    def download_objects(self, objects: List[Tuple[str, str]],
                         remote_engine: "PsycopgEngine") -> List[str]:
        """
        Download objects from Minio.

        :param objects: List of (object ID, object URL (object ID it's stored under))
        """
        # By default, take up the whole connection pool with downloaders
        # (less one connection for the main thread that handles metadata)
        worker_threads = self.params.get(
            "threads",
            int(get_singleton(CONFIG, "SG_ENGINE_POOL")) - 1)

        logging.info("Getting download URLs from registry %s...",
                     remote_engine)
        object_ids = [o[0] for o in objects]
        remote_object_ids = [o[1] for o in objects]
        urls = get_object_download_urls(remote_engine, remote_object_ids)

        local_engine = get_engine()

        def _do_download(object_url):
            object_id, url = object_url
            logging.debug("%s -> %s", url[0], object_id)

            try:
                local_engine.run_api_call("download_object", object_id, url)
                local_engine.mount_object(object_id)
            except Exception as e:
                logging.error("Error downloading object %s: %s", object_id,
                              str(e))

                # Delete the object that we just tried to download to make sure we don't have
                # a situation where the file was downloaded but mounting failed (currently
                # we inspect the filesystem to see the list of downloaded objects).
                # TODO figure out a flow for just remounting objects whose files we already have.
                local_engine.delete_objects([object_id])
                return None

            return object_id

        successful: List[str] = []

        try:
            # Temporarily set the engine into autocommit mode. This is because a transaction
            # commit resets session state and makes the download_object engine API call
            # import all of its Python modules again (which takes about 300ms). It also
            # resets the SD and GD dictionaries so it's not possible to cache those modules
            # there either.
            local_engine.autocommit = True
            with ThreadPoolExecutor(max_workers=worker_threads) as tpe:
                # Evaluate the results so that exceptions thrown by the downloader get raised
                pbar = tqdm(
                    tpe.map(_do_download, zip(object_ids, urls)),
                    total=len(objects),
                    unit="obj",
                    ascii=SG_CMD_ASCII,
                )
                for object_id in pbar:
                    if object_id:
                        successful.append(object_id)
                        pbar.set_postfix(object=object_id[:10] + "...")
            if len(successful) < len(object_ids):
                raise IncompleteObjectDownloadError(
                    reason=None, successful_objects=successful)
            return successful
        except KeyboardInterrupt as e:
            raise IncompleteObjectDownloadError(reason=e,
                                                successful_objects=successful)
        finally:
            # Flip the engine back and close all but one pool connection.
            local_engine.autocommit = False
            local_engine.close_others()
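
The objects argument pairs the local object ID with the ID/URL it is stored under remotely; the method unzips these pairs before asking the registry for download URLs. A small sketch of the expected input shape (the object IDs are made up):

# Made-up object IDs, only to show the (object ID, remote storage ID) shape.
objects = [
    ("o1234abcd", "o1234abcd"),  # usually stored under its own ID
    ("o5678efgh", "o9999aaaa"),  # but the remote ID can differ
]
object_ids = [o[0] for o in objects]
remote_object_ids = [o[1] for o in objects]
assert object_ids == ["o1234abcd", "o5678efgh"]
assert remote_object_ids == ["o1234abcd", "o9999aaaa"]
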
Example 9
def _parse_paths_overrides(
    lookup_path: str, override_path: str
) -> Tuple[List[str], Dict[str, str]]:
    return (
        lookup_path.split(",") if lookup_path else [],
        {r[: r.index(":")]: r[r.index(":") + 1 :] for r in override_path.split(",")}
        if override_path
        else {},
    )


# Parse and set these on import. If we ever need to be able to reread the config on the fly, these have to be
# recalculated.
_LOOKUP_PATH, _LOOKUP_PATH_OVERRIDE = _parse_paths_overrides(
    get_singleton(CONFIG, "SG_REPO_LOOKUP"), get_singleton(CONFIG, "SG_REPO_LOOKUP_OVERRIDE")
)
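
A worked example of the parsing above: the lookup path is split on commas, and each override entry is split on its first colon into a repository-to-remote mapping (the names below are made up):

# Made-up repository and remote names, purely to show the parsing behaviour.
lookup, overrides = _parse_paths_overrides(
    lookup_path="origin,data.example.com",
    override_path="noaa/climate:origin,internal/sales:local_engine",
)
assert lookup == ["origin", "data.example.com"]
assert overrides == {
    "noaa/climate": "origin",
    "internal/sales": "local_engine",
}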


def init_engine(skip_object_handling: bool = False) -> None:  # pragma: no cover
    # Method exercised in test_commandline.test_init_new_db but in
    # an external process
    """
    Initializes the engine by:

        * performing any required engine-custom initialization
        * creating the metadata tables

    :param skip_object_handling: If True, skips installing routines related to
        object handling and checkouts (like audit triggers and CStore management).
    """