def _create_foreign_tables(
    self, schema: str, server_id: str, tables: TableInfo
) -> List[MountError]:
    """
    Fetch metadata for the requested Socrata datasets and mount them as foreign tables.

    :param schema: Schema name handed to the mount query generator
    :param server_id: Foreign server name handed to the mount query generator
    :param tables: Either a list of Socrata dataset IDs or a mapping whose values
        carry a ``"socrata_id"`` entry in their second element. Ignored when
        ``self.tables`` is set (non-empty), which takes precedence.
    :return: An empty list (no per-table mount errors are collected here)
    :raises RepositoryNotFoundError: if the domain/dataset lookup hits the HTML 404
        page or returns no datasets
    """
    from sodapy import Socrata
    from psycopg2.sql import SQL

    not_found_message = "Socrata domain or dataset not found!"

    logging.info("Getting Socrata metadata")
    client = Socrata(
        domain=self.params["domain"],
        app_token=self.credentials.get("app_token"),
    )

    # Tables configured on the instance win over the ones passed in.
    tables = self.tables or tables
    sought_ids = (
        tables
        if isinstance(tables, list)
        else [entry[1]["socrata_id"] for entry in tables.values()]
    )

    try:
        datasets = client.datasets(ids=sought_ids, only=["dataset"])
    except Exception as e:
        # sodapy doesn't detect a missing domain/dataset itself: it trips over the
        # text/html content type of the 404 page instead. Only that specific failure
        # is translated into a friendlier error; everything else propagates as is.
        if "Unknown response format: text/html" not in str(e):
            raise
        raise RepositoryNotFoundError(not_found_message) from e

    if not datasets:
        raise RepositoryNotFoundError(not_found_message)

    statements, statement_args = generate_socrata_mount_queries(
        sought_ids, datasets, schema, server_id, tables
    )
    combined_query = SQL(";").join(statements)
    self.engine.run_sql(combined_query, statement_args)
    return []
def lookup_repository(name: str, include_local: bool = False) -> "Repository":
    """
    Find the engine on the lookup path that hosts the given repository.

    An entry in the lookup path override for this exact name short-circuits the
    search and is returned without checking that the repository exists there.

    :param name: Repository name
    :param include_local: If True, the local engine is checked before the lookup path
    :return: Local or remote Repository object
    :raises RepositoryNotFoundError: if no queried engine has the repository
    """
    from splitgraph.core.repository import Repository

    template = Repository.from_schema(name)
    namespace, repo_name = template.namespace, template.repository

    if name in _LOOKUP_PATH_OVERRIDE:
        override_engine = get_engine(_LOOKUP_PATH_OVERRIDE[name])
        return Repository(namespace, repo_name, override_engine)

    # Existence is currently determined solely by whether the repository
    # is present on the engine being queried.
    if include_local and repository_exists(template):
        return template

    for engine_name in _LOOKUP_PATH:
        remote = Repository(namespace, repo_name, get_engine(engine_name))
        if not repository_exists(remote):
            # Not hosted here: release the connection before trying the next engine.
            remote.engine.close()
            continue
        return remote

    raise RepositoryNotFoundError("Unknown repository %s!" % name)
def convert(
    self, value: str, param: Optional[Parameter], ctx: Optional[Context]
) -> "Repository":
    """
    Turn a repository specification string into a Repository object.

    When ``self.exists`` is set, the repository must also exist, otherwise
    a RepositoryNotFoundError is raised.
    """
    from splitgraph.core.repository import Repository

    repository = Repository.from_schema(value)
    if not self.exists:
        return repository

    # Imported lazily: only needed when the existence check is requested.
    from splitgraph.core.engine import repository_exists

    if repository_exists(repository):
        return repository
    raise RepositoryNotFoundError("Unknown repository %s" % repository)
def by_tag(self, tag: str, raise_on_none: bool = True) -> Optional[Image]:
    """
    Look up an image in this repository by its tag.

    :param tag: Tag. 'latest' is a special case: it resolves to the most recently
        created image in the repository and always raises if there are no images.
    :param raise_on_none: For ordinary tags, whether a missing tag raises
        an error (True) or yields None (False).
    :return: The tagged image, or None if the tag is missing and
        ``raise_on_none`` is False.
    :raises RepositoryNotFoundError: if the repository doesn't exist
    :raises ImageNotFoundError: if the tag (or any image, for 'latest') can't be found
    """
    engine = self.engine
    repository = self.repository

    if not repository_exists(repository):
        raise RepositoryNotFoundError("Unknown repository %s!" % str(repository))

    repo_args = (repository.namespace, repository.repository)

    if tag == "latest":
        # Special case: pick the newest image by creation time.
        latest_query = select(
            "get_images",
            ",".join(IMAGE_COLS),
            schema=SPLITGRAPH_API_SCHEMA,
            table_args="(%s,%s)",
        ) + SQL(" ORDER BY created DESC LIMIT 1")
        latest_row = self.engine.run_sql(
            latest_query, repo_args, return_shape=ResultShape.ONE_MANY
        )
        if latest_row is None:
            raise ImageNotFoundError("No images found in %s!" % repository.to_schema())
        return self._make_image(latest_row)

    tagged_query = select(
        "get_tagged_images",
        "image_hash",
        "tag = %s",
        schema=SPLITGRAPH_API_SCHEMA,
        table_args="(%s,%s)",
    )
    image_hash = engine.run_sql(
        tagged_query, repo_args + (tag,), return_shape=ResultShape.ONE_ONE
    )

    if image_hash is not None:
        return self.by_hash(image_hash)
    if not raise_on_none:
        return None

    schema = repository.to_schema()
    if tag == "HEAD":
        # A missing HEAD tag means nothing is checked out, so give a hint on how to fix it.
        raise ImageNotFoundError(
            'No current checked out revision found for %s. Check one out with "sgr '
            'checkout %s:image_hash".' % (schema, schema)
        )
    raise ImageNotFoundError("Tag %s not found in repository %s" % (tag, schema))
def convert(
    self, value: str, param: Optional[Parameter], ctx: Optional[Context]
) -> Tuple["Repository", Optional[Union["Image", str]]]:
    """
    Parse an image specification of the form [NAMESPACE/]REPOSITORY[:HASH_OR_TAG].

    Returns a tuple of (repository object, tag or hash). When ``self.get_image``
    is set and a tag/hash was given, the second element is the image looked up
    from the repository rather than the raw string.
    """
    from splitgraph.core.output import parse_repo_tag_or_hash

    repo, tag_or_hash = parse_repo_tag_or_hash(value, default=self.default)

    if self.get_image or self.repository_exists:
        # The repository has to exist if the caller asked for that check
        # or if an actual Image object is to be produced from it.
        from splitgraph.core.engine import repository_exists

        if not repository_exists(repo):
            raise RepositoryNotFoundError("Unknown repository %s" % repo)

    if tag_or_hash is None or not self.get_image:
        return repo, tag_or_hash
    return repo, repo.images[tag_or_hash]
def mount_socrata(
    mountpoint: str,
    server,
    port,
    username,
    password,
    domain: str,
    tables: Optional[Dict[str, Any]] = None,
    app_token: Optional[str] = None,
    batch_size: Optional[int] = 10000,
) -> None:
    """
    Mount a Socrata dataset.

    Mounts a remote Socrata dataset and forwards queries to it
    \b

    :param domain: Socrata domain, for example, data.albanyny.gov. Required.
    :param tables: A dictionary mapping PostgreSQL table names to Socrata table IDs. For example, {"salaries": "xzkq-xp2w"}.
        If skipped, ALL tables in the Socrata endpoint will be mounted.
    :param app_token: Socrata app token. Optional.
    :param batch_size: Amount of rows to fetch from Socrata per request (limit parameter). Maximum 50000.
    """
    # Imports are local to the function, as in the original.
    from splitgraph.engine import get_engine
    from sodapy import Socrata
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    logging.info("Mounting Socrata domain...")
    server_id = mountpoint + "_server"

    # Only options that were actually supplied are forwarded to the FDW server.
    options: Dict[str, Optional[str]] = {
        "wrapper": "splitgraph.ingestion.socrata.fdw.SocrataForeignDataWrapper",
    }
    if domain:
        options["domain"] = domain
    if app_token:
        options["app_token"] = app_token
    if batch_size:
        options["batch_size"] = str(batch_size)

    init_fdw(
        engine,
        server_id=server_id,
        wrapper="multicorn",
        server_options=options,
    )

    engine.run_sql(SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint)))

    logging.info("Getting Socrata metadata")
    client = Socrata(domain=domain, app_token=app_token)

    # Materialise the IDs as a list (an empty one means "no filter") instead of
    # passing a live dict view around.
    sought_ids = list(tables.values()) if tables else []

    try:
        datasets = client.datasets(ids=sought_ids, only=["dataset"])
    except Exception as e:
        if "Unknown response format: text/html" in str(e):
            # If the Socrata dataset/domain isn't found, sodapy doesn't catch it directly
            # and instead stumbles on an unexpected content-type of the 404 page it's served.
            # We catch that and reraise a more friendly message.
            raise RepositoryNotFoundError("Socrata domain or dataset not found!") from e
        # Any other failure must propagate: swallowing it here would leave `datasets`
        # unbound and surface as an unrelated NameError below.
        raise

    mount_statements, mount_args = generate_socrata_mount_queries(
        sought_ids, datasets, mountpoint, server_id, tables
    )

    engine.run_sql(SQL(";").join(mount_statements), mount_args)