Example #1
    def display_tree(
            self, line: str,
            local_ns: Dict[str, Dict[str, Any]]) -> IPython.core.display.JSON:
        """

        Parameters
        ----------
        line : str
            Dataset name with data to display
        local_ns : Dict[str, Dict[str, Any]]
            The local namespace from which the dataset is looked up

        Returns
        -------
        tree : IPython.core.display.JSON
            An IPython JSON display representing the data.

        Example
        --------
        >>> %display_tree dataset
        """

        # The first token on the magic line names the dataset variable in local_ns
        line_args = line.split(" ")
        dataset = local_ns[line_args[0]]
        return display_json(dataset)
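The %display_tree usage in the docstring implies this method is registered as an IPython line magic. A minimal sketch of how that wiring might look, using IPython's magic API (the class name DisplayMagics and the registration call site are assumptions, not part of the original):

from IPython import get_ipython
from IPython.core.magic import Magics, line_magic, magics_class, needs_local_scope

@magics_class
class DisplayMagics(Magics):  # hypothetical container class
    @line_magic
    @needs_local_scope  # asks IPython to pass in the caller's local namespace
    def display_tree(self, line, local_ns):
        # body as in the example above
        ...

get_ipython().register_magics(DisplayMagics)  # enables `%display_tree dataset`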
Example #2
    def getCatalog(self,
                   database: Optional[str] = None
                   ) -> IPython.core.display.JSON:
        """
        Get the Glue Data Catalog of a specific database.

        Parameters
        ----------
        database : str, optional
            Name of the database to catalog.

        Returns
        -------
        schema : IPython.core.display.JSON
            An IPython JSON display representing the schema metadata for a database.

        Example
        --------
        >>> from aws.utils.notebooks.database import AthenaUtils
        >>> from aws.utils.notebooks.json import display_json
        >>> AthenaUtils.getCatalog(database="my_database")
        """

        glue = boto3.client("glue")
        schemas = {}
        response = glue.get_tables(DatabaseName=database, MaxResults=1000)
        for t in response["TableList"]:
            # Record each table's name, S3 location, and column metadata
            schemas[t["Name"]] = {
                "name": t["Name"],
                "location": t["StorageDescriptor"]["Location"],
                "cols": {
                    c["Name"]: {"name": c["Name"], "type": c["Type"]}
                    for c in t["StorageDescriptor"]["Columns"]
                },
            }

        return display_json(schemas, root="glue databases")
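One caveat worth noting: a single get_tables call returns at most one page of results, so a database with more tables than MaxResults would be silently truncated. A sketch of the same loop using boto3's built-in paginator (the database name is a placeholder):

import boto3

glue = boto3.client("glue")
schemas = {}
# paginate() transparently follows NextToken across all result pages
for page in glue.get_paginator("get_tables").paginate(DatabaseName="my_database"):
    for t in page["TableList"]:
        schemas[t["Name"]] = {
            "name": t["Name"],
            "location": t["StorageDescriptor"]["Location"],
            "cols": {c["Name"]: {"name": c["Name"], "type": c["Type"]}
                     for c in t["StorageDescriptor"]["Columns"]},
        }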
Example #3
    def getCatalog(
            self,
            schema_name: Optional[str] = None,
            table_name: Optional[str] = None) -> IPython.core.display.JSON:
        """
        Get Glue Catalog metadata of a specific Database table.

        Parameters
        ----------
        schema_name : str, optional
            Name of the schema to retrieve the Glue Catalog for (by default, all schemas containing the
            given table name in the current database are considered).

        table_name : str, optional
            Name of the table to retrieve the Glue Catalog for (by default, all tables in the given
            schema in the current database are considered).

        Returns
        -------
        schema : IPython.core.display.JSON
            An IPython JSON display representing the schema metadata for the table(s) / database

        Example
        --------
        >>> from aws.utils.notebooks.database import RedshiftUtils
        >>> from aws.utils.notebooks.json import display_json
        >>> RedshiftUtils.getCatalog(schema_name="my_schema",table_name="table1")
        """

        glue = boto3.client("glue")
        s3 = boto3.client("s3")
        sql = """
                    SELECT cols.schemaname, cols.tablename, cols.columnname, cols.external_type, cols.columnnum, tables.location, schemas.databasename, tables.parameters
                     FROM SVV_EXTERNAL_COLUMNS cols natural join SVV_EXTERNAL_TABLES tables natural join SVV_EXTERNAL_SCHEMAS schemas
                """

        if schema_name or table_name:
            sql += "WHERE "
            if schema_name and table_name:
                sql += f"schemaname = '{schema_name}' AND tablename = '{table_name}'"
            elif schema_name:
                sql += f"schemaname = '{schema_name}'"
            elif table_name:
                sql += f"tablename = '{table_name}'"

        sql += "\n order by schemas.schemaname, tables.tablename, columnnum\n"

        rs = self.current_engine.execute(sql)
        res = rs.fetchall()
        if len(res) == 0:
            print("no external schema or tables found")
            return

        df = DataFrame(res)
        df.columns = rs.keys()
        schemas = {}
        # itertuples rows are positional (row[0] is the index): row[1]=schemaname,
        # row[2]=tablename, row[3]=columnname, row[4]=external_type,
        # row[5]=columnnum, row[6]=location, row[7]=databasename
        for row in df.itertuples():
            if row[1] not in schemas:
                schemas[row[1]] = {}
            if row[2] not in schemas[row[1]]:
                table = {"name": row[2], "location": row[6], "cols": {}}
                schemas[row[1]][row[2]] = table
                self._add_json_schema(s3, glue, table, row[7], row[2])
            table = schemas[row[1]][row[2]]
            table["cols"][row[3]] = {
                "name": row[3],
                "type": row[4],
                "order": row[5],
            }

        return display_json(schemas, root="glue databases")
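Interpolating schema_name and table_name into the SQL with f-strings works here, but it is vulnerable to SQL injection if the arguments ever come from untrusted input. A sketch of the filter rewritten with SQLAlchemy bound parameters, assuming current_engine is a legacy-style SQLAlchemy engine as the execute()/fetchall() usage above suggests (the trimmed column list is illustrative only):

from sqlalchemy import text

# Bound parameters let the driver escape values instead of f-string interpolation;
# engine stands in for self.current_engine from the method above
stmt = text(
    "SELECT cols.schemaname, cols.tablename, cols.columnname "
    "FROM SVV_EXTERNAL_COLUMNS cols "
    "WHERE cols.schemaname = :schema_name AND cols.tablename = :table_name"
)
rs = engine.execute(stmt, schema_name="my_schema", table_name="table1")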
Example #4
    def create_external_table(
        self,
        select: str,
        database_name: str,
        table_name: str,
        format: Optional[str] = "Parquet",
        s3_location: Optional[str] = None,
        options: Optional[str] = "",
    ) -> IPython.core.display.JSON:
        """
        Creates a new external table in the given database and runs a glue crawler to populate glue catalog tables with
        table metadata.

        Note
        ----
        This method uses Amazon Redshift UNLOAD, which splits the results of a SELECT statement across a set of
        files, one or more files per node slice, to simplify parallel reloading of the data.

        Parameters
        ----------
        select : str
            SQL SELECT statement for the data to export (e.g. 'select * from table').
        database_name : str
            Name of database with existing data and schema.
        table_name : str
            The name of the table to be created.
        format : str, optional
            The file format for data files (default Parquet format).
        s3_location : str, optional
            The path to the Amazon S3 bucket or folder that contains the data files or a manifest file that contains a
            list of Amazon S3 object paths (used only if the database has no location).
        options : str, optional
            Additional table properties to apply when creating the new table.

        Returns
        -------
        s : IPython.core.display.JSON
            An IPython JSON display representing the schema metadata for the table(s) / database

        Example
        --------
        >>> from aws.utils.notebooks.database import RedshiftUtils
        >>> from aws.utils.notebooks.json import display_json
        >>> RedshiftUtils.create_external_table(
        ...     select = 'select * from table1',
        ...     database_name = 'my_database',
        ...     table_name = 'myQuery1',
        ...     s3_location = 's3://bucketname/folder/'
        ... )
        """
        glue = boto3.client("glue")
        target_s3 = glue.get_database(
            Name=database_name)["Database"]["LocationUri"]
        s3 = boto3.client("s3")

        if target_s3 is None or len(target_s3) == 0:
            if s3_location is not None:
                target_s3 = s3_location
            else:
                raise Exception(
                    "Database does not have a location, and location was not provided"
                )
        # Strip the "s3://" prefix, then split out the bucket name and key prefix
        bucket_name = target_s3[5:].split("/")[0]
        db_path = "/".join(target_s3[5:].split("/")[1:])
        s3_path = f"{target_s3}/{table_name}/"
        prefix = f"{db_path}/{table_name}/"
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
        if "Contents" in response:
            for object in response["Contents"]:
                logger.info("Deleting {object['Key']}")
                s3.delete_object(Bucket=bucket_name, Key=object["Key"])

        ddl = f"""
            UNLOAD ('{select}')
                    TO '{s3_path}' iam_role '{self.redshift_role}'
                    FORMAT AS {format}
                    {options}
        """

        self.execute_ddl(ddl)

        logger.info("Query result s3 write complete.")

        run_crawler(f"crawler_for_{database_name}_{table_name}", database_name,
                    s3_path)

        response = glue.get_table(DatabaseName=database_name, Name=table_name)

        record_count = response["Table"]["Parameters"]["recordCount"]
        schema = response["Table"]["StorageDescriptor"]["Columns"]
        location = response["Table"]["StorageDescriptor"]["Location"]
        data_format = response["Table"]["Parameters"]["classification"]
        logger.info(
            f"Table {table_name} created: records: {record_count}, format: {data_format}, location: {location}"
        )
        # Summarize the resulting schema as {column name: column type}
        s = {c["Name"]: c["Type"] for c in schema}
        return display_json(s, root="cols")
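The run_crawler helper invoked above is not shown in this example. A minimal sketch of what such a helper might look like with the Glue API, offered purely as an assumption about its behavior (the IAM role name is a placeholder and the polling loop is simplified):

import time
import boto3

def run_crawler(crawler_name, database_name, s3_path):
    # Create (or reuse) a crawler over the unloaded files and wait for it to finish
    glue = boto3.client("glue")
    try:
        glue.create_crawler(
            Name=crawler_name,
            Role="GlueCrawlerRole",  # placeholder IAM role with Glue/S3 access
            DatabaseName=database_name,
            Targets={"S3Targets": [{"Path": s3_path}]},
        )
    except glue.exceptions.AlreadyExistsException:
        pass  # a crawler with this name already exists; reuse it
    glue.start_crawler(Name=crawler_name)
    # Poll until the crawler returns to the READY state
    while glue.get_crawler(Name=crawler_name)["Crawler"]["State"] != "READY":
        time.sleep(10)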