def display_tree(self, line: str, local_ns: Dict[str, Dict[str, Any]]) -> IPython.core.display.JSON:
    """
    Display a dataset from the user namespace as an interactive JSON tree.

    Parameters
    ----------
    line : str
        Name of the dataset to display.
    local_ns : Dict[str, Dict[str, Any]]
        The IPython user namespace in which to look up the dataset.

    Returns
    -------
    tree : IPython.core.display.JSON
        An IPython JSON display representing the data.

    Example
    --------
    >>> %display_tree dataset
    """
    # The first token on the magic line is the dataset name.
    lineArgs = line.split(" ")
    dataset = local_ns[lineArgs[0]]
    return display_json(dataset)
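# Hedged sketch, not the repo's actual wiring: display_tree takes (line, local_ns)
# like an IPython line magic, so a registration roughly like the one below is
# presumably needed before `%display_tree dataset` works in a notebook. The class
# name and the extension hook are illustrative assumptions.
from IPython.core.magic import Magics, line_magic, magics_class, needs_local_scope


@magics_class
class DisplayMagics(Magics):  # hypothetical class name, for illustration only
    @needs_local_scope
    @line_magic("display_tree")
    def display_tree(self, line, local_ns=None):
        # Look up the named dataset in the caller's namespace and render it.
        return display_json(local_ns[line.split(" ")[0]])


def load_ipython_extension(ipython):
    # `%load_ext <module>` would call this and make %display_tree available.
    ipython.register_magics(DisplayMagics)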
def getCatalog(self, database: Optional[str] = None) -> IPython.core.display.JSON:
    """
    Get the Glue Data Catalog of a specific database.

    Parameters
    ----------
    database : str, optional
        Name of the database to catalog.

    Returns
    -------
    schema : IPython.core.display.JSON
        An IPython JSON display representing the schema metadata for the database.

    Example
    --------
    >>> from aws.utils.notebooks.database import AthenaUtils
    >>> from aws.utils.notebooks.json import display_json
    >>> AthenaUtils.getCatalog(database="my_database")
    """
    glue = boto3.client("glue")
    schemas = dict()
    response = glue.get_tables(DatabaseName=database, MaxResults=1000)
    # Build a nested {table name -> {name, location, cols}} mapping from the
    # Glue table list.
    for t in response["TableList"]:
        table = dict()
        schemas[t["Name"]] = table
        table["name"] = t["Name"]
        table["location"] = t["StorageDescriptor"]["Location"]
        table["cols"] = dict()
        for c in t["StorageDescriptor"]["Columns"]:
            col = dict()
            table["cols"][c["Name"]] = col
            col["name"] = c["Name"]
            col["type"] = c["Type"]
    return display_json(schemas, root="glue databases")
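# Hedged sketch, not part of the original API: glue.get_tables caps each response
# at MaxResults (1000 here) and returns a NextToken when more tables exist, so a
# very large database would be silently truncated above. boto3 ships a paginator
# for get_tables that hides the token loop; a paginated variant of the listing
# could look like this (the helper name is an assumption):
def _iter_glue_tables(glue, database: str):
    """Yield every table in a Glue database, following pagination."""
    paginator = glue.get_paginator("get_tables")
    for page in paginator.paginate(DatabaseName=database):
        yield from page["TableList"]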
def getCatalog(self, schema_name: Optional[str] = None, table_name: Optional[str] = None) -> Optional[IPython.core.display.JSON]:
    """
    Get Glue Catalog metadata of a specific database table.

    Parameters
    ----------
    schema_name : str, optional
        Name of the schema to retrieve the Glue Catalog for (by default, all
        schemas with the given table names in the current database).
    table_name : str, optional
        Name of the table to retrieve the Glue Catalog for (by default, all
        tables with the given schema names in the current database).

    Returns
    -------
    schema : IPython.core.display.JSON
        An IPython JSON display representing the schema metadata for the
        table(s) / database.

    Example
    --------
    >>> from aws.utils.notebooks.database import RedshiftUtils
    >>> from aws.utils.notebooks.json import display_json
    >>> RedshiftUtils.getCatalog(schema_name="my_schema", table_name="table1")
    """
    glue = boto3.client("glue")
    s3 = boto3.client("s3")
    sql = """
        SELECT cols.schemaname,
               cols.tablename,
               cols.columnname,
               cols.external_type,
               cols.columnnum,
               tables.location,
               schemas.databasename,
               tables.parameters
        FROM SVV_EXTERNAL_COLUMNS cols
            natural join SVV_EXTERNAL_TABLES tables
            natural join SVV_EXTERNAL_SCHEMAS schemas
    """
    if schema_name or table_name:
        sql += "WHERE "
        if schema_name and table_name:
            sql += f"schemaname = '{schema_name}' AND tablename = '{table_name}'"
        elif schema_name:
            sql += f"schemaname = '{schema_name}'"
        elif table_name:
            sql += f"tablename = '{table_name}'"
    sql += "\n order by schemas.schemaname, tables.tablename, columnnum\n"
    rs = self.current_engine.execute(sql)
    res = rs.fetchall()
    if len(res) == 0:
        print("no external schemas or tables found")
        return
    df = DataFrame(res)
    df.columns = rs.keys()
    schemas = dict()
    for row in df.itertuples():
        # row[0] is the DataFrame index; row[1]=schemaname, row[2]=tablename,
        # row[3]=columnname, row[4]=external_type, row[5]=columnnum,
        # row[6]=location, row[7]=databasename.
        if row[1] not in schemas:
            schemas[row[1]] = dict()
        if row[2] not in schemas[row[1]]:
            # First time we see this table: initialize its entry once.
            schemas[row[1]][row[2]] = dict()
            table = schemas[row[1]][row[2]]
            table["name"] = row[2]
            table["location"] = row[6]
            table["cols"] = dict()
            self._add_json_schema(s3, glue, table, row[7], row[2])
        table = schemas[row[1]][row[2]]
        colInfo = dict()
        colInfo["name"] = row[3]
        colInfo["type"] = row[4]
        colInfo["order"] = row[5]
        table["cols"][row[3]] = colInfo
    return display_json(schemas, root="glue databases")
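# Hedged sketch, assuming self.current_engine is a SQLAlchemy engine (only its
# execute()/fetchall() use is visible in this section): the f-string filters above
# splice schema_name/table_name straight into the SQL, which is fragile against
# quoting and unsafe for untrusted input. Bound parameters via sqlalchemy.text()
# would express the same filters safely; the helper below is illustrative only.
from sqlalchemy import text


def _external_columns_query(base_sql: str, schema_name=None, table_name=None):
    """Return a (text clause, params) pair with the optional filters bound."""
    clauses, params = [], {}
    if schema_name:
        clauses.append("schemaname = :schema_name")
        params["schema_name"] = schema_name
    if table_name:
        clauses.append("tablename = :table_name")
        params["table_name"] = table_name
    sql = base_sql
    if clauses:
        sql += " WHERE " + " AND ".join(clauses)
    sql += " order by schemas.schemaname, tables.tablename, columnnum"
    return text(sql), params

# Usage would be:
#   clause, params = _external_columns_query(sql, schema_name, table_name)
#   rs = self.current_engine.execute(clause, **params)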
def create_external_table(
    self,
    select: str,
    database_name: str,
    table_name: str,
    format: Optional[str] = "Parquet",
    s3_location: Optional[str] = None,
    options: Optional[str] = "",
) -> IPython.core.display.JSON:
    """
    Create a new external table in the given database and run a Glue crawler
    to populate the Glue Catalog with the table metadata.

    Note
    ----
    This method uses Amazon Redshift UNLOAD, which splits the results of a
    SELECT statement across a set of files, one or more files per node slice,
    to simplify parallel reloading of the data.

    Parameters
    ----------
    select : str
        SQL SELECT statement for the data to export (e.g. 'select * from table').
    database_name : str
        Name of the database with existing data and schema.
    table_name : str
        The name of the table to be created.
    format : str, optional
        The file format for data files (default: Parquet).
    s3_location : str, optional
        The path to the Amazon S3 bucket or folder that contains the data
        files, or a manifest file that contains a list of Amazon S3 object
        paths (used only if the database has no location of its own).
    options : str, optional
        Additional options appended to the UNLOAD statement.

    Returns
    -------
    s : IPython.core.display.JSON
        An IPython JSON display representing the schema metadata for the
        table(s) / database.

    Example
    --------
    >>> from aws.utils.notebooks.database import RedshiftUtils
    >>> from aws.utils.notebooks.json import display_json
    >>> RedshiftUtils.create_external_table(
    ...     select='select * from table1',
    ...     database_name='my_database',
    ...     table_name='myQuery1',
    ...     s3_location='s3://bucketname/folder/'
    ... )
    """
    glue = boto3.client("glue")
    target_s3 = glue.get_database(Name=database_name)["Database"]["LocationUri"]
    s3 = boto3.client("s3")
    if not target_s3:
        if s3_location is not None:
            target_s3 = s3_location
        else:
            raise Exception(
                "Database does not have a location, and no location was provided"
            )
    # Derive the bucket, database prefix, and table prefix from the
    # s3://bucket/path location.
    bucket_name = target_s3[5:].split("/")[0]
    db_path = "/".join(target_s3[5:].split("/")[1:])
    s3_path = f"{target_s3}/{table_name}/"
    prefix = f"{db_path}/{table_name}/"
    # Remove any previous output under the table prefix before unloading.
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    if "Contents" in response:
        for obj in response["Contents"]:
            logger.info(f"Deleting {obj['Key']}")
            s3.delete_object(Bucket=bucket_name, Key=obj["Key"])
    ddl = f"""
        UNLOAD ('{select}')
        TO '{s3_path}'
        iam_role '{self.redshift_role}'
        FORMAT AS {format}
        {options}
    """
    self.execute_ddl(ddl)
    logger.info("Query result s3 write complete.")
    run_crawler(f"crawler_for_{database_name}_{table_name}", database_name, s3_path)
    # Read back the crawled table metadata for the summary and return value.
    response = glue.get_table(DatabaseName=database_name, Name=table_name)
    recordCount = response["Table"]["Parameters"]["recordCount"]
    schema = response["Table"]["StorageDescriptor"]["Columns"]
    location = response["Table"]["StorageDescriptor"]["Location"]
    data_format = response["Table"]["Parameters"]["classification"]
    logger.info(
        f"Table {table_name} created: records: {recordCount}, "
        f"format: {data_format}, location: {location}"
    )
    s = {}
    for c in schema:
        s[c["Name"]] = c["Type"]
    return display_json(s, root="cols")
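# Hedged sketch, not part of the original code: list_objects_v2 returns at most
# 1000 keys per call, so the cleanup loop above can miss objects under a large
# table prefix. A paginated cleanup that also batches deletes (delete_objects
# accepts up to 1000 keys per request) could look like this; the helper name is
# an assumption:
def _delete_prefix(s3, bucket: str, prefix: str) -> None:
    """Delete every object under an S3 prefix, page by page."""
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if keys:
            s3.delete_objects(Bucket=bucket, Delete={"Objects": keys})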