def __init__(
    self,
    object_engine: "PostgresEngine",
    metadata_engine: Optional["PostgresEngine"] = None,
) -> None:
    """
    :param object_engine: An ObjectEngine that will be used as a backing store for the objects.
    :param metadata_engine: An SQLEngine that will be used to store/query metadata for Splitgraph
        images and objects. By default, `object_engine` is used.
    """
    super().__init__(object_engine, metadata_engine)

    # Cache size in bytes
    self.cache_size = int(get_singleton(CONFIG, "SG_OBJECT_CACHE_SIZE")) * 1024 * 1024

    # 0 to infinity; higher means objects with smaller sizes are more likely to
    # get evicted than objects that haven't been used for a while.
    # Currently calculated so that an object that hasn't been accessed for 5 minutes has the same
    # removal priority as an object twice its size that's just been accessed.
    self.eviction_decay_constant = float(get_singleton(CONFIG, "SG_EVICTION_DECAY"))

    # Objects smaller than this size are assumed to have this size (to simulate the latency of
    # downloading them).
    self.eviction_floor = float(get_singleton(CONFIG, "SG_EVICTION_FLOOR")) * 1024 * 1024

    # Fraction of the cache size to free when eviction is run (the greater value of this amount
    # and the amount needed to download required objects is actually freed). Eviction is an
    # expensive operation (it pauses concurrent downloads) so increasing this makes eviction
    # happen less often at the cost of more possible cache misses.
    self.eviction_min_fraction = float(get_singleton(CONFIG, "SG_EVICTION_MIN_FRACTION"))
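
# Illustrative sketch (not part of the library): how raw config values like the ones read above
# map onto the derived attributes. The config keys are the real ones used above; the literal
# values below are made-up examples.
def _example_eviction_settings() -> None:
    example_config = {
        "SG_OBJECT_CACHE_SIZE": "1024",      # cache size, MB
        "SG_EVICTION_DECAY": "0.002",        # higher: recency matters more than object size
        "SG_EVICTION_FLOOR": "1",            # MB; smaller objects are treated as being this big
        "SG_EVICTION_MIN_FRACTION": "0.05",  # free at least 5% of the cache per eviction run
    }
    cache_size = int(example_config["SG_OBJECT_CACHE_SIZE"]) * 1024 * 1024
    eviction_floor = float(example_config["SG_EVICTION_FLOOR"]) * 1024 * 1024
    print("cache: %d bytes, eviction floor: %d bytes" % (cache_size, eviction_floor))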
def get_latest_version(self) -> Optional[str]:
    # Do a version check to see if updates are available. If the user is logged
    # into the registry, also send the user ID for metrics.
    # The user can opt out by setting "SG_UPDATE_FREQUENCY" to 0 or opt out of
    # sending user ID by setting SG_UPDATE_ANONYMOUS to true.
    config = create_config_dict()
    frequency = int(get_singleton(config, "SG_UPDATE_FREQUENCY"))
    if frequency == 0:
        return None

    last_check = int(get_singleton(config, "SG_UPDATE_LAST"))
    now = int(time.time())
    if last_check + frequency > now:
        return None

    headers = get_headers()
    if get_singleton(config, "SG_UPDATE_ANONYMOUS").lower() == "false":
        try:
            headers.update({"Authorization": "Bearer " + self.access_token})
        except AuthAPIError:
            pass

    try:
        logging.debug("Running update check")
        response = requests.post(
            self.endpoint + "/update_check",
            verify=self.verify,
            headers=headers,
        )
        response.raise_for_status()
        latest_version = str(response.json()["latest_version"])
    except requests.RequestException as e:
        logging.debug("Error running the update check", exc_info=e)
        return None
    except KeyError:
        logging.debug("Malformed response from the update service")
        return None

    try:
        patch_and_save_config(config, {"SG_UPDATE_LAST": str(now)})
    except Exception as e:
        logging.debug("Error patching the config", exc_info=e)
        return latest_version

    return latest_version
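
# Illustrative sketch (not part of the library): the gating logic above in isolation. A check
# only runs when SG_UPDATE_FREQUENCY is nonzero and at least that many seconds have passed
# since the SG_UPDATE_LAST timestamp; all arguments here are plain integers.
def _example_update_check_due(frequency: int, last_check: int, now: int) -> bool:
    if frequency == 0:
        # Update checks are disabled entirely.
        return False
    return last_check + frequency <= now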
def query_schema(self, wrapper: Optional[str] = FDW_CLASS, commit: bool = True) -> Iterator[str]:
    """
    Creates a temporary schema with tables in this image mounted as foreign tables that can be
    accessed via read-only layered querying. On exit from the context manager, the schema
    is discarded.

    :return: The name of the schema the image is located in.
    """
    tmp_schema = str.format("o{:032x}", getrandbits(128))
    try:
        self.object_engine.create_schema(tmp_schema)
        self._lq_checkout(target_schema=tmp_schema, wrapper=wrapper)
        if commit:
            self.object_engine.commit()  # Make sure the new tables are seen by other connections

        # Inject extra query planner hints as session variables if specified.
        lq_tuning = get_singleton(CONFIG, "SG_LQ_TUNING")
        if lq_tuning:
            self.object_engine.run_sql(lq_tuning)
        yield tmp_schema
    finally:
        self.object_engine.run_sql(
            SQL("DROP SCHEMA IF EXISTS {} CASCADE; DROP SERVER IF EXISTS {} CASCADE;").format(
                Identifier(tmp_schema),
                Identifier(tmp_schema + "_lq_checkout_server"),
            )
        )
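
# Illustrative usage sketch (not part of the library): assumes `image` is an object that exposes
# query_schema() as a context manager, as the docstring above describes, and that the image
# contains a table called my_table.
def _example_layered_query(image) -> None:
    with image.query_schema() as schema_name:
        image.object_engine.run_sql(
            SQL("SELECT COUNT(1) FROM {}.my_table").format(Identifier(schema_name))
        )
    # By this point the temporary schema and its foreign server have been dropped,
    # even if the query raised.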
def access_token(self) -> str:
    """
    Will return an up-to-date access token by either getting it from the configuration file
    or contacting the auth service for a new one. Will write the new access token
    into the configuration file.

    :return: Access token.
    """
    config = create_config_dict()

    try:
        current_access_token = get_from_subsection(
            config, "remotes", self.remote, "SG_CLOUD_ACCESS_TOKEN"
        )
        exp = get_token_claim(current_access_token, "exp")
        now = time.time()
        if now < exp - self.access_token_expiry_tolerance:
            return current_access_token
    except KeyError:
        pass

    # Token expired or non-existent, get a new one.
    try:
        api_key = get_from_subsection(config, "remotes", self.remote, "SG_ENGINE_USER")
        api_secret = get_from_subsection(config, "remotes", self.remote, "SG_ENGINE_PWD")
        new_access_token = cast(str, self.get_access_token_from_api(api_key, api_secret))
    except KeyError as e:
        try:
            refresh_token = get_from_subsection(
                config, "remotes", self.remote, "SG_CLOUD_REFRESH_TOKEN"
            )
            new_access_token = cast(str, self.get_access_token(refresh_token))
        except KeyError:
            raise AuthAPIError(
                ("No refresh token or API keys found in the config for remote %s! " % self.remote)
                + "Log into the registry using sgr cloud login."
            ) from e

    set_in_subsection(config, "remotes", self.remote, "SG_CLOUD_ACCESS_TOKEN", new_access_token)
    overwrite_config(config, get_singleton(config, "SG_CONFIG_FILE"))
    return new_access_token
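
# Illustrative sketch (not part of the library): the reuse-vs-refresh decision above in
# isolation. A cached token is only reused while it has more than `tolerance` seconds left
# before its `exp` claim; otherwise a fresh token is fetched and written back to the config.
def _example_token_still_fresh(exp: float, tolerance: float) -> bool:
    return time.time() < exp - tolerance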
def serialize_config(
    config: ConfigDict, config_format: bool, no_shielding: bool, include_defaults: bool = True
) -> str:
    """
    Pretty-print the configuration or print it in the Splitgraph config file format.

    :param config: Configuration dictionary.
    :param config_format: Output the configuration in the Splitgraph config file format.
    :param no_shielding: Don't replace sensitive values (like passwords) with asterisks.
    :param include_defaults: Emit a config variable even if it's the same as the default.
    :return: Textual representation of the config.
    """
    result = "[defaults]\n" if config_format else ""

    # Emit normal config parameters
    for key in KEYS:
        if config_format and key in _SITUATIONAL_PARAMS:
            continue
        value = get_singleton(config, key)
        if include_defaults or key not in DEFAULTS or value != DEFAULTS[key]:
            result += _kv_to_str(key, value, no_shielding) + "\n"

    # Emit hoisted remotes
    result += "\nCurrent registered remote engines:\n" if not config_format else ""
    for remote, remote_config in get_all_in_section(config, "remotes").items():
        assert isinstance(remote_config, dict)
        if config_format:
            result += "\n" + serialize_engine_config(remote, remote_config, no_shielding) + "\n"
        else:
            result += "\n%s:\n" % remote
            for key, value in remote_config.items():
                result += _kv_to_str(key, value, no_shielding) + "\n"

    # Print Splitfile commands
    if "commands" in config:
        result += "\nSplitfile command plugins:\n" if not config_format else "[commands]\n"
        for command_name, command_class in get_all_in_section(config, "commands").items():
            result += _kv_to_str(command_name, cast(str, command_class), no_shielding) + "\n"

    # Print data sources
    if "data_sources" in config:
        result += "\nData sources:\n" if not config_format else "[data_sources]\n"
        for handler_name, handler_func in get_all_in_section(config, "data_sources").items():
            result += _kv_to_str(handler_name, cast(str, handler_func), no_shielding) + "\n"

    # Print external object handlers
    if "external_handlers" in config:
        result += "\nExternal object handlers:\n" if not config_format else "[external_handlers]\n"
        for handler_name, handler_func in get_all_in_section(config, "external_handlers").items():
            result += _kv_to_str(handler_name, cast(str, handler_func), no_shielding) + "\n"

    return result
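
# Illustrative usage sketch (not part of the library): the two output modes of serialize_config.
# `config` is any ConfigDict, e.g. the one loaded at startup.
def _example_dump_config(config: ConfigDict) -> None:
    # Human-readable listing with sensitive values shielded:
    print(serialize_config(config, config_format=False, no_shielding=False))
    # Output suitable for pasting straight into an .sgconfig file:
    print(serialize_config(config, config_format=True, no_shielding=True))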
@click.command(name="commit")
@click.argument("repository", type=RepositoryType(exists=True))
@click.option(
    "-s",
    "--snap",
    default=False,
    is_flag=True,
    help="Do not delta compress the changes and instead store the whole table again. "
    "This consumes more space, but makes checkouts faster.",
)
@click.option(
    "-c",
    "--chunk-size",
    default=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")),
    type=int,
    help="Split new tables into chunks of this many rows (by primary key). The default "
    "value is governed by the SG_COMMIT_CHUNK_SIZE configuration parameter.",
)
@click.option(
    "-k",
    "--chunk-sort-keys",
    default=None,
    type=JsonType(),
    help="Sort the data inside each chunk by this/these key(s)",
)
@click.option(
    "-t",
    "--split-changesets",
def upload_objects(
    self, objects: List[str], remote_engine: "PsycopgEngine"
) -> List[Tuple[str, str]]:
    """
    Upload objects to Minio.

    :param remote_engine: Remote engine used to request the upload URLs.
    :param objects: List of object IDs to upload.
    :return: List of tuples with successfully uploaded objects and their URLs.
    """
    worker_threads = self.params.get(
        "threads", int(get_singleton(CONFIG, "SG_ENGINE_POOL")) - 1
    )

    # Determine upload URLs
    logging.info("Getting upload URLs from the registry...")
    urls = get_object_upload_urls(remote_engine, objects)

    local_engine = get_engine()

    def _do_upload(object_url):
        object_id, url = object_url
        # We get 3 URLs here (one each for the object itself, its footer and its schema) --
        # emit just the first one for logging.
        logging.debug("%s -> %s", object_id, url[0])
        try:
            local_engine.run_api_call("upload_object", object_id, url)
            return object_id
        except Exception:
            logging.exception("Error uploading object %s", object_id)
            return None

    successful: List[str] = []
    try:
        local_engine.autocommit = True
        with ThreadPoolExecutor(max_workers=worker_threads) as tpe:
            pbar = tqdm(
                tpe.map(_do_upload, zip(objects, urls)),
                total=len(objects),
                unit="objs",
                ascii=SG_CMD_ASCII,
            )
            for object_id in pbar:
                if object_id:
                    successful.append(object_id)
                    pbar.set_postfix(object=object_id[:10] + "...")
        if len(successful) < len(objects):
            raise IncompleteObjectUploadError(
                reason=None,
                successful_objects=successful,
                successful_object_urls=successful,
            )
        # The "URL" in this case is the same object ID: we ask the registry
        # for the actual URL by giving it the object ID.
        return [(s, s) for s in successful]
    except KeyboardInterrupt as e:
        raise IncompleteObjectUploadError(
            reason=e,
            successful_objects=successful,
            successful_object_urls=successful,
        )
    finally:
        local_engine.autocommit = False
        local_engine.close_others()
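
# Illustrative sketch (not part of the library): how a caller might react to a partial upload.
# This assumes IncompleteObjectUploadError exposes the successful_objects it was constructed
# with above; `handler` stands for an instance of the class defining upload_objects.
def _example_upload_with_retry(handler, objects, remote_engine):
    try:
        return handler.upload_objects(objects, remote_engine)
    except IncompleteObjectUploadError as e:
        remaining = [o for o in objects if o not in e.successful_objects]
        logging.warning("%d object(s) were not uploaded, retrying them", len(remaining))
        return [(s, s) for s in e.successful_objects] + handler.upload_objects(
            remaining, remote_engine
        )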
def download_objects(
    self, objects: List[Tuple[str, str]], remote_engine: "PsycopgEngine"
) -> List[str]:
    """
    Download objects from Minio.

    :param objects: List of (object ID, object URL) pairs, where the URL is the object ID
        that the object is stored under.
    """
    # By default, take up the whole connection pool with downloaders
    # (less one connection for the main thread that handles metadata).
    worker_threads = self.params.get(
        "threads", int(get_singleton(CONFIG, "SG_ENGINE_POOL")) - 1
    )

    logging.info("Getting download URLs from registry %s...", remote_engine)
    object_ids = [o[0] for o in objects]
    remote_object_ids = [o[1] for o in objects]
    urls = get_object_download_urls(remote_engine, remote_object_ids)

    local_engine = get_engine()

    def _do_download(object_url):
        object_id, url = object_url
        logging.debug("%s -> %s", url[0], object_id)
        try:
            local_engine.run_api_call("download_object", object_id, url)
            local_engine.mount_object(object_id)
        except Exception as e:
            logging.error("Error downloading object %s: %s", object_id, str(e))
            # Delete the object that we just tried to download to make sure we don't have
            # a situation where the file was downloaded but mounting failed (currently
            # we inspect the filesystem to see the list of downloaded objects).
            # TODO figure out a flow for just remounting objects whose files we already have.
            local_engine.delete_objects([object_id])
            return None
        return object_id

    successful: List[str] = []
    try:
        # Temporarily set the engine into autocommit mode. This is because a transaction
        # commit resets session state and makes the download_object engine API call
        # import all of its Python modules again (which takes about 300ms). It also
        # resets the SD and GD dictionaries, so it's not possible to cache those modules
        # there either.
        local_engine.autocommit = True
        with ThreadPoolExecutor(max_workers=worker_threads) as tpe:
            # Evaluate the results so that exceptions thrown by the downloader get raised
            pbar = tqdm(
                tpe.map(_do_download, zip(object_ids, urls)),
                total=len(objects),
                unit="obj",
                ascii=SG_CMD_ASCII,
            )
            for object_id in pbar:
                if object_id:
                    successful.append(object_id)
                    pbar.set_postfix(object=object_id[:10] + "...")
        if len(successful) < len(object_ids):
            raise IncompleteObjectDownloadError(reason=None, successful_objects=successful)
        return successful
    except KeyboardInterrupt as e:
        raise IncompleteObjectDownloadError(reason=e, successful_objects=successful)
    finally:
        # Flip the engine back and close all but one pool connection.
        local_engine.autocommit = False
        local_engine.close_others()
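
# Illustrative sketch (not part of the library): the concurrency pattern used by both
# upload_objects and download_objects, reduced to its essentials. ThreadPoolExecutor.map is
# lazy, so worker results (and any exceptions they raise) only surface when the tqdm-wrapped
# iterator is consumed; workers that fail return None and are filtered out instead of aborting
# the whole batch.
def _example_parallel_map(work_items, do_one, worker_threads=4):
    successful = []
    with ThreadPoolExecutor(max_workers=worker_threads) as tpe:
        for result in tqdm(tpe.map(do_one, work_items), total=len(work_items), unit="obj"):
            if result:
                successful.append(result)
    return successful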
def _parse_paths_overrides(
    lookup_path: str, override_path: str
) -> Tuple[List[str], Dict[str, str]]:
    return (
        lookup_path.split(",") if lookup_path else [],
        {r[: r.index(":")]: r[r.index(":") + 1 :] for r in override_path.split(",")}
        if override_path
        else {},
    )


# Parse and set these on import. If we ever need to be able to reread the config on the fly,
# these have to be recalculated.
_LOOKUP_PATH, _LOOKUP_PATH_OVERRIDE = _parse_paths_overrides(
    get_singleton(CONFIG, "SG_REPO_LOOKUP"), get_singleton(CONFIG, "SG_REPO_LOOKUP_OVERRIDE")
)


def init_engine(skip_object_handling: bool = False) -> None:  # pragma: no cover
    # Method exercised in test_commandline.test_init_new_db but in
    # an external process
    """
    Initializes the engine by:

    * performing any required engine-custom initialization
    * creating the metadata tables

    :param skip_object_handling: If True, skips installing routines related to
        object handling and checkouts (like audit triggers and CStore management).
    """
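
# Illustrative sketch (not part of the library): what _parse_paths_overrides (defined at the
# top of this module) produces for a typical pair of config values; the remote and repository
# names below are made up.
def _example_lookup_path_parsing() -> None:
    paths, overrides = _parse_paths_overrides("origin,upstream", "myuser/myrepo:origin")
    assert paths == ["origin", "upstream"]
    assert overrides == {"myuser/myrepo": "origin"}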