Example #1
def test_lake_creator_setup(kube: TestClient):
    workspace = get_workspace()
    notebook_bucket = workspace['ScratchBucket']
    env_name = workspace['env_name']
    # Locate Bucket Paths
    demo_config = get_demo_configuration(env_name)
    lake_bucket = demo_config.get("LakeBucket").split(':::')[1]
    users_bucket = notebook_bucket.split("/")[2]
    logger.info(f"lake_bucket={lake_bucket}, users_bucket={users_bucket}")
    # Create Databases
    database_name = f"cms_raw_db_{env_name}".replace(
        '-', '_')  # athena doesnt support '-' in db or table name
    create_db(database_name, lake_bucket, 'lake: claims data from cms')
    assert check_database_exists(database_name) == database_name
    create_db('default', lake_bucket)
    assert check_database_exists('default') == 'default'
    create_db('users', users_bucket)
    assert check_database_exists('users') == 'users'
    # Get orbit job parameters
    location = GLUE.get_database(Name=database_name)['Database']['LocationUri']
    bucket = location[5:].split('/')[0]
    logger.info(f"bucket={bucket}, location={location}")
    extracted_prefix = "extracted/"
    # S3 Clean Up
    clean_bucket_prefix(bucket, extracted_prefix)
    assert len(get_s3_extracted_files(bucket, extracted_prefix)) == 0
    #sh.run("rm -f /home/jovyan/shared/regression/CREATOR_PASSED")
    clean_bucket_prefix(bucket, database_name)
    assert len(get_s3_extracted_files(bucket, database_name)) == 0


def get_lake_creator_list_of_files():
    orbit_workspace = get_workspace()
    env_name = orbit_workspace['env_name']

    notebooks_run_config = {
        # notebook names to skip entirely. Example: ["Example-7-Data-Profiling"]
        "exclusion_list": ['Example-3-Ray Job Example', 'Example-4-Ray Tune Example', 'Example-92-Delete-DemoCronJobs',
                           'Example-1-simple', 'Example-2-spark', 'Example-3-gpu', 'Example-90-Failure-Behavior',
                           'Example-6-Schedule-Notebook', 'Example-8-SDK-Controller-Sched'],
        "inclusion_list": [],  # if not empty, only these notebooks run. Example: ["Example-7-Data-Profiling"]
        # notebooks whose failures are ignored. Example: ["Example-6-Schedule-Notebook", "Example-8-LakeFormation-Security"]
        "optional_list": [],
        # minimum number of notebooks that must complete for the whole test to pass
        # (only has an effect when larger than the number of mandatory notebooks)
        "minimum_successful": 1,
        "maxRetries": 3,  # max number of attempts to execute a notebook
        "notebooks_to_run": [],  # populated below with all notebooks selected for execution
        "sagemaker_notebooks_list": ["Example-1-xgboost_mnist",
                                     "Example-2-SageMaker-Batch Transform - breast cancer prediction with high level SDK",
                                     "Example-5-SageMaker-on-EKS-xgboost_mnist"]
        # sagemaker notebooks with small profile
    }

    # If running in an isolated env, extend the exclusion_list
    if env_name.endswith('-iso'):
        notebooks_run_config["exclusion_list"].append('Example-91-LakeFormation-Security')
        notebooks_run_config["exclusion_list"].append('Example-5-SageMaker-on-EKS-xgboost_mnist')

    sample_notebooks_path = "../../samples/notebooks"
    analyst_folders = ["B-DataAnalyst", "I-Image", "H-Model-Development"]
    notebook_file_path = []
    # List specific folders for analyst notebooks
    for folder in analyst_folders:
        logger.info(f"Reading folder={folder}")
        notebooks = [str(nb) for nb in Path(f"{sample_notebooks_path}/{folder}").glob("*.ipynb")]
        notebook_file_path += notebooks
    sorted_notebook_paths = sorted(notebook_file_path)

    for p in sorted_notebook_paths:
        parts = p.split('/')
        nb_file_name, nb_folder = parts[-1], parts[-2]
        nb_name = nb_file_name.split('.')[0]
        logger.info(f"nb_folder={nb_folder}/nb_name={nb_name}")
        if nb_name in notebooks_run_config["exclusion_list"]:
            # the exclusion_list takes priority over the inclusion_list
            logger.info(f"Ignoring notebook={nb_name}")
            continue
        if not notebooks_run_config["inclusion_list"] or nb_name in notebooks_run_config["inclusion_list"]:
            # run the notebook if the inclusion_list is empty or the notebook is listed in it
            if nb_folder in ["H-Model-Development"]:
                notebooks_run_config["notebooks_to_run"].append(
                    {"folder": nb_folder, "name": nb_file_name, "profile": "small"})
            else:
                notebooks_run_config["notebooks_to_run"].append({"folder": nb_folder, "name": nb_file_name})

    return notebooks_run_config
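

# Usage sketch (not part of the original source): log the execution plan produced by
# get_lake_creator_list_of_files(). Only names defined above (logger, the builder
# function, and its config keys) are used; `log_lake_creator_run_plan` itself is a
# hypothetical helper added for illustration.
def log_lake_creator_run_plan() -> None:
    config = get_lake_creator_list_of_files()
    for nb in config["notebooks_to_run"]:
        profile = nb.get("profile", "default")
        logger.info(
            f"would run {nb['folder']}/{nb['name']} "
            f"(profile={profile}, maxRetries={config['maxRetries']})"
        )
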
def update_teamspace_lakeformation_permissions(
        db_name: Optional[str] = "*") -> None:
    """
    Scan the provided database's tables. Based on the security selector for the given
    Team Space and on the current column tags, permissions are updated to allow access
    to the permitted columns.

    Parameters
    -----------
    db_name: optional, str
        Name of the database for which to update permissions.

    Returns
    -------
    None
        None.

    Example
    --------
    >>> import aws_orbit_sdk.glue_catalog as glue
    >>> glue.update_teamspace_lakeformation_permissions(database_name)
    """
    workspace = get_workspace()
    lambda_client = boto3.client("lambda")

    inp = {
        "env_name": workspace["env_name"],
        "team_space": workspace["team_space"],
        "db_name": db_name,
        "role_arn": workspace["EksPodRoleArn"],
    }
    payload = json.dumps(inp)
    response = lambda_client.invoke(
        FunctionName=
        f"orbit-{workspace['env_name']}-authorize_lake_formation_for_role",
        InvocationType="RequestResponse",
        LogType="Tail",
        Payload=bytes(payload, "utf-8"),
    )

    if response["ResponseMetadata"]["HTTPStatusCode"] == 200:
        response_payload = json.loads(
            response["Payload"].read().decode("utf-8"))
        if "errorMessage" in response_payload:
            raise Exception(response_payload["errorMessage"])

    print("Lakeformation permissions have been updated")
Example #4
    def get_sample_data(self, database: str, table: str, sample: int,
                        field: str, direction: str):
        workspace = get_workspace()
        logger.info(
            f"query staging location: {workspace['ScratchBucket']}/athena/query/"
        )
        conn = pyathena.connect(
            s3_staging_dir=f"{workspace['ScratchBucket']}/athena/query/",
            region_name=workspace["region"],
        )
        if field and len(field) > 0:
            # sort by the requested field; direction is expected to be "asc" or "desc"
            query = f'SELECT * FROM "{database}"."{table}" ORDER BY {field} {direction} LIMIT {sample}'
        else:
            query = f'SELECT * FROM "{database}"."{table}" LIMIT {sample}'
        df = pd.read_sql(query, conn)
        result = df.to_json(orient="records")
        return result
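
# Usage sketch (assumption, not from the original source): `catalog` stands in for an
# instance of the class that defines get_sample_data (the class itself is not shown in
# this snippet); database, table and field names are illustrative placeholders.
# records_json = catalog.get_sample_data(
#     database="cms_raw_db", table="claims", sample=10,
#     field="clm_from_dt", direction="desc")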
Example #5
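# The two handlers below appear to be Jupyter server / Tornado API handlers (an
# assumption based on self.log / self.finish); they switch between live AWS calls and a
# cached JSON fixture via the MOCK environment variable:
#   MOCK unset        -> call AWS only
#   MOCK == "0"       -> call AWS and refresh the test/mockup/*.json fixture
#   MOCK set, != "0"  -> serve the cached fixture without calling AWS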
    def get(self):
        global DATA
        self.log.info(f"GET - {self.__class__}")
        if "MOCK" not in os.environ or os.environ["MOCK"] == "0":
            DATA = get_workspace()
            cluster_name = "orbit-" + DATA["env_name"]
            eks_nodegroups = controller.get_nodegroups(cluster_name=cluster_name)
            self.log.debug(f"eks_nodegroups={eks_nodegroups}")
            if "MOCK" in os.environ:
                path = f"{Path(__file__).parent.parent.parent}/test/mockup/compute-eks.json"
                self.log.info(f"writing mockup data to {path}")
                with open(path, "w") as outfile:
                    json.dump(eks_nodegroups, outfile, indent=4)
        else:
            path = f"{Path(__file__).parent.parent.parent}/test/mockup/compute-eks.json"
            with open(path) as f:
                eks_nodegroups = json.load(f)

        self.finish(self._dump(eks_nodegroups))
    def get(self):
        global DATA
        self.log.info(f"GET - {self.__class__}")
        if "MOCK" not in os.environ or os.environ["MOCK"] == "0":
            DATA = get_workspace()
            # hide some details
            if "Elbs" in DATA:
                del DATA["Elbs"]
            if "Plugins" in DATA:
                del DATA["Plugins"]

            if "MOCK" in os.environ:
                path = f"{Path(__file__).parent.parent.parent}/test/mockup/team.json"
                self.log.info(f"writing mockup data to {path}")
                with open(path, "w") as outfile:
                    json.dump(DATA, outfile, indent=4)
        else:
            path = f"{Path(__file__).parent.parent.parent}/test/mockup/team.json"
            with open(path) as f:
                DATA = json.load(f)

        self.finish(self._dump(DATA))
# Initialize parameters
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger()
glue = boto3.client("glue")
sns = boto3.client("sns")
s3 = boto3.client("s3")

# Output path and other parameters for the reports
notebook_name = "Automated-Data-Transformations.ipynb"
workspace = get_workspace()
team_space = workspace["team_space"]
env_name = workspace["env_name"]
source_path = "$ORBIT_TRANSFORMATION_NOTEBOOKS_ROOT"
base_path = "orbit/profiling"
logger.info(f"Team space: {team_space}, Environment name: {env_name}")


def create_tasks(glue_tables: Dict[str, Any], target_folder: str,
                 database: str, samplingRatio: float) -> List[Dict[str, Any]]:
    """
    Creating a data profiling task for each Glue table in the database.

    Parameters
    ------------
    glue_tables : list
def run_crawler(crawler: str,
                target_db: str,
                target_path: str,
                wait: Optional[bool] = True) -> str:
    """
    Start a Glue crawler over the given S3 path and create tables in the target database
    based on the data it finds. The call can optionally wait until the crawler is done
    and the tables are created.

    Parameters
    ----------
    crawler: str
        A unique name of the Crawler
    target_db: str
        The name of the target database
    target_path: str
        The S3 Path where the data for the table resides.
    wait: optional, bool
        If True, will wait until the Crawler is finished.

    Returns
    -------
    state: str
        The state of the crawler after it has finished running and creating tables.

    Example
    -------
    >>> import aws_orbit_sdk.glue_catalog as glue
    >>> response = glue.run_crawler(crawler, target_db, target_path, wait=True)
    """
    role = get_workspace()["EksPodRoleArn"]
    glue = boto3.client("glue")
    try:
        glue.delete_crawler(Name=crawler)
        logger.info("existing crawler deleted")
    except Exception as e:
        # ignore the error when the crawler does not exist yet
        error = str(e)
        if "EntityNotFoundException" not in error:
            logger.error(error)

    response = glue.create_crawler(
        Name=crawler,
        Role=role,
        DatabaseName=target_db,
        Targets={"S3Targets": [{
            "Path": target_path
        }]},
    )
    state = response["ResponseMetadata"]["HTTPStatusCode"]
    if state != 200:
        raise Exception("Failed to create crawler")

    glue.start_crawler(Name=crawler)

    logger.info("Crawler started...")
    state = "INIT"
    while state != "READY":
        response = glue.get_crawler(Name=crawler)
        state = response["Crawler"]["State"]
        if not wait:
            return state
        logger.info(f"Crawler in state: {state}, waiting a min... ")
        time.sleep(60)

    response = glue.get_crawler_metrics(CrawlerNameList=[crawler])
    if "CrawlerMetricsList" not in response or "TablesCreated" not in response[
            "CrawlerMetricsList"][0]:
        raise Exception("Crawler failed to create table")

    stats = response["CrawlerMetricsList"][0]

    logger.info(stats)

    logger.info("Crawler finished creating table")
    return state
Example #9
    def get_connection_to_athena(
        self,
        DbName: str,
        region_name: Optional[str] = None,
        S3QueryResultsLocation: Optional[str] = None,
    ) -> Dict[str, Union[str, sa.engine.Engine]]:
        """
        Connect Athena to an existing database

        Parameters
        ----------
        DbName : str
            Name of the Glue database.

        region_name : str, optional
            The region used to connect to Athena. Defaults to the workspace region if None.

        S3QueryResultsLocation : str, optional
            The S3 location where query results are stored. Defaults to
            "{ScratchBucket}/athena" if None.


        Returns
        -------
        connection : dict
            A dict with two keys: "db_url", the SQLAlchemy connection string, and
            "engine", the SQLAlchemy engine connected to Athena.

        Example
        --------
        >>> from aws.utils.notebooks.database import AthenaUtils
        >>> from aws.utils.notebooks.common import get_workspace
        >>> conn = AthenaUtils.get_connection_to_athena(
        ...     DbName=glue_db,
        ...     region_name=my_region,
        ...     S3QueryResultsLocation=results_location)
        >>> db_url, engine = conn["db_url"], conn["engine"]
        """

        workspace = get_workspace()
        if region_name is None:
            region_name = workspace["region"]

        if S3QueryResultsLocation is None:
            S3QueryResultsLocation = f"{workspace['ScratchBucket']}/athena"

        template_con_str = (
            "awsathena+rest://athena.{region_name}.amazonaws.com:443/"
            "{schema_name}?s3_staging_dir={s3_staging_dir}")
        conn_str = template_con_str.format(
            region_name=region_name,
            schema_name=DbName,
            s3_staging_dir=quote_plus(S3QueryResultsLocation),
        )

        engine = create_engine(conn_str)
        self.db_url = conn_str
        self.current_engine = engine
        self.db_class = "athena"
        return {
            "db_url": self.db_url,
            "engine": self.current_engine,
        }
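
# Usage sketch (assumption, not from the original source): hand the returned engine to
# pandas to query Athena. Constructing AthenaUtils() with no arguments and the
# "cms_raw_db" / "claims" names are illustrative assumptions.
import pandas as pd

athena = AthenaUtils()
conn_info = athena.get_connection_to_athena(DbName="cms_raw_db")
sample_df = pd.read_sql('SELECT * FROM "claims" LIMIT 10', conn_info["engine"])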