    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # The code that defines your stack goes here
        glue_trigger = glue.CfnTrigger(self,
                                       "gluetrigger",
                                       name="etl-trigger",
                                       type="ON_DEMAND",
                                       schedule=None,
                                       actions=[{
                                           "jobName": "glue_crawler"
                                       }])

        glue_crawler = glue.CfnCrawler(
            self,
            'glue-crawler-id',
            description="Glue Crawler for my-data-science-s3",
            name='nickcrawler',
            database_name='nike',
            schedule=None,
            role=
            'arn:aws:iam::919238404395:role/service-role/AWSGlueServiceRole-my_2nd_iamrole',
            targets={
                "s3Targets": [{
                    "path": "s3://nikhils3/file/Titanic.csv"
                }]
            })

        glue_trigger.add_depends_on(glue_crawler)
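        # Hedged sketch (not in the original example): the trigger above points
        # at a *job* named "glue_crawler", while the resource created here is a
        # crawler named "nickcrawler". If the intent is to start the crawler
        # itself, AWS::Glue::Trigger actions also accept a "crawlerName" key:
        crawler_trigger = glue.CfnTrigger(self,
                                          "crawlertrigger",
                                          name="etl-crawler-trigger",
                                          type="ON_DEMAND",
                                          actions=[{
                                              "crawlerName": "nickcrawler"
                                          }])
        crawler_trigger.add_depends_on(glue_crawler)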
Example #2
    def __init__(
        self,
        scope: cdk.Construct,
        construct_id: str,
        stack_log_level: str,
        vpc,
        my_sql_db_sg,
        store_events_db_endpoint,
        sales_events_bkt,
        _glue_etl_role,
        glue_db_name: str,
        glue_table_name: str,
        tgt_db_secret,
        **kwargs,
    ) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.template_options.metadata = {"License": "Miztiik Corp."}

        # ADD Permissions to our Glue JOB Role to Access Secrets
        tgt_db_secret.grant_read(_glue_etl_role)

        # # Create GLUE JDBC Connection for RDS MySQL

        # Allow ALL PORTS within SG for GLUE Connections to connect
        # https://docs.aws.amazon.com/glue/latest/dg/connection-defining.html#connection-properties-jdbc
        # https://docs.aws.amazon.com/glue/latest/dg/setup-vpc-for-glue-access.html
        # https://docs.amazonaws.cn/en_us/glue/latest/dg/connection-defining.html
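        # Hedged sketch (assumption: my_sql_db_sg does not already carry such a
        # rule): the docs linked above call for a self-referencing rule on the
        # security group so the Glue connection ENIs can reach each other on all
        # TCP ports. It could be added roughly like this:
        my_sql_db_sg.add_ingress_rule(
            peer=my_sql_db_sg,
            connection=_ec2.Port.all_tcp(),
            description="Self-referencing rule for AWS Glue connection ENIs"
        )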

        rds_mysql_conn_props = _glue.CfnConnection.ConnectionInputProperty(
            connection_type="JDBC",
            description="Glue Connection for RDS MySQL Store Events Database",
            name="rdsMySQL57Conn",
            physical_connection_requirements=_glue.CfnConnection.PhysicalConnectionRequirementsProperty(
                security_group_id_list=[my_sql_db_sg.security_group_id],
                subnet_id=vpc.select_subnets(
                        subnet_type=_ec2.SubnetType.PRIVATE
                ).subnet_ids[1]
            ),
            connection_properties={
                "JDBC_CONNECTION_URL": f"jdbc:mysql://{store_events_db_endpoint}:3306/store_events",
                "JDBC_ENFORCE_SSL": "false",
                "USERNAME": "******",
                "PASSWORD": "******"
            }
        )

        rds_mysql_conn = _glue.CfnConnection(
            self,
            "rdsMySQLGlueConnection",
            catalog_id=f"{cdk.Aws.ACCOUNT_ID}",
            connection_input=rds_mysql_conn_props
        )

        # Create the Glue job to ingest incoming JSON from S3 into RDS
        # Read the Glue Spark job code (sanity check only; the script itself is uploaded below as an S3 asset)
        try:
            with open(
                "stacks/back_end/glue_stacks/glue_job_scripts/load_json_to_rds.py",
                encoding="utf-8",
                mode="r",
            ) as f:
                load_json_to_rds = f.read()
        except OSError:
            print("Unable to read Glue Job Code")
            raise

        etl_script_asset = _s3_assets.Asset(
            self,
            "etlScriptAsset",
            path="stacks/back_end/glue_stacks/glue_job_scripts/load_json_to_rds.py"
        )
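        # Hedged addition (assumption: the passed-in _glue_etl_role cannot yet
        # read the CDK asset bucket): grant the job role read access to the
        # uploaded ETL script.
        etl_script_asset.grant_read(_glue_etl_role)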

        self.etl_prefix = "stream-etl"
        _glue_etl_job = _glue.CfnJob(
            self,
            "glues3ToRdsIngestorJob",
            name="s3-to-rds-ingestor",
            description="Glue Job to ingest JSON data from S3 to RDS",
            role=_glue_etl_role.role_arn,
            glue_version="2.0",
            command=_glue.CfnJob.JobCommandProperty(
                name="glueetl",
                script_location=f"s3://{etl_script_asset.s3_bucket_name}/{etl_script_asset.s3_object_key}",
                python_version="3"
            ),
            connections={"connections": [rds_mysql_conn_props.name]},
            default_arguments={
                # Glue job arguments are passed as strings
                "--enable-metrics": "true",
                "--enable-continuous-cloudwatch-log": "true",
                "--job-bookmark-option": "job-bookmark-enable",
                '--TempDir': f"s3://{sales_events_bkt.bucket_name}/bookmarks",
                "--src_db_name": glue_db_name,
                "--src_etl_bkt": f"{sales_events_bkt.bucket_name}",
                "--crawler_tbl_prefix": "txns_",
                "--tgt_db_secret_arn": tgt_db_secret.secret_arn,
                "--tgt_tbl_name": glue_table_name,
                "--conn_name": f"{rds_mysql_conn_props.name}"
            },
            allocated_capacity=1,
            # timeout=2,
            max_retries=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=2)
        )

        # Configure a scheduled trigger - the cron below fires once a day at 01:00 UTC
        _glue_etl_job_trigger = _glue.CfnTrigger(
            self,
            "glueEtlJobtrigger",
            type="SCHEDULED",
            description="Miztiik Automation: Trigger S3 to RDS Ingestor glue job daily at 01:00 UTC",
            schedule="cron(0 1 * * ? *)",
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(
                    job_name=f"{_glue_etl_job.name}",
                    timeout=2
                )
            ]
        )
        _glue_etl_job_trigger.add_depends_on(_glue_etl_job)

        # Configure Glue Workflow
        _glue_etl_job_workflow = _glue.CfnWorkflow(
            self,
            "glueEtlJobWorkflow"
        )
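        # Hedged note (assumption): the workflow above is created empty and
        # nothing is attached to it. A trigger joins a workflow through its
        # workflow_name property, e.g. (illustrative only):
        #
        #   _glue.CfnTrigger(..., workflow_name=_glue_etl_job_workflow.ref, ...)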

        ###########################################
        ################# OUTPUTS #################
        ###########################################
        output_0 = cdk.CfnOutput(
            self,
            "AutomationFrom",
            value=f"{GlobalArgs.SOURCE_INFO}",
            description="To know more about this automation stack, check out our github page.",
        )

        output_1 = cdk.CfnOutput(
            self,
            "RDSIngestorETLGlueJob",
            value=f"https://console.aws.amazon.com/gluestudio/home?region={cdk.Aws.REGION}#/jobs",
            description="Glue Job to ingest JSON data from S3 to RDS.",
        )
Example #3
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        with open('./props/tasksetting.json', 'r') as f1:
            py_json1 = json.load(f1)
            ts = json.dumps(py_json1)

        # with open('./props/mappingrule.json', 'r') as f2:
        #     py_json2 = json.load(f2)
        #     mr = json.dumps(py_json2)

        with open('./props/config.json', 'r') as f2:
            configuration = json.load(f2)

        def getMappingrules(self, table_list):
            rules = []
            for index, value in enumerate(table_list, 1):
                rules.append({
                    "rule-type": "selection",
                    "rule-id": str(index),
                    "rule-name": str(index),
                    "object-locator": {
                        "schema-name": value['schemaName'],
                        "table-name": value['tableName']
                    },
                    "rule-action": "include",
                    "filters": []
                })
            mapping_rules = {"rules": rules}
            return json.dumps(mapping_rules)
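        # Worked example (hypothetical config values): a tableList entry such as
        #   [{"schemaName": "sales", "tableName": "orders"}]
        # produces the DMS table-mapping JSON:
        #   {"rules": [{"rule-type": "selection", "rule-id": "1", "rule-name": "1",
        #               "object-locator": {"schema-name": "sales", "table-name": "orders"},
        #               "rule-action": "include", "filters": []}]}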

        # The code that defines your stack goes here
        S3Accessrole = _iam.Role(
            self,
            'dmsrole',
            assumed_by=_iam.ServicePrincipal('dms.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'AmazonS3FullAccess')
            ])

        raw_bucket = s3.Bucket(self,
                               'rawbucket',
                               bucket_name='rawbucket-datalake-cdk-oregon')
        raw_bucket.add_lifecycle_rule(
            enabled=configuration['s3LifecycleRule']['enabled'],
            expiration=core.Duration.days(
                configuration['s3LifecycleRule']['expiration']))

        #my_table = ddb.Table(self, id ='dunamoTable', table_name = 'testcdktable',
        #partition_key = ddb.Attribute(name ='lastname',type = ddb.AttributeType.STRING) )

        dl_dms = _dms.CfnReplicationInstance(
            self,
            'dmsreplication',
            replication_instance_class=configuration['DMS_instance_setting']
            ['instance_class'],
            replication_instance_identifier='datalake-instance-cdk',
            allocated_storage=configuration['DMS_instance_setting']
            ['allocated_storage'])

        source_endpoint = _dms.CfnEndpoint(
            self,
            'sourceendpoint',
            endpoint_type='source',
            engine_name=configuration['engineName'],
            database_name=configuration['databaseName'],
            username=configuration['username'],
            password=configuration['password'],
            port=configuration['port'],
            server_name=configuration['serverName'],
        )

        target_endpoint = _dms.CfnEndpoint(
            self,
            'targetendpoint',
            endpoint_type='target',
            engine_name='s3',
            s3_settings={
                'bucketName': raw_bucket.bucket_name,
                'serviceAccessRoleArn': S3Accessrole.role_arn
            },
            extra_connection_attributes='dataFormat=parquet')

        dms_task = _dms.CfnReplicationTask(
            self,
            'data2lake-task',
            migration_type='full-load-and-cdc',
            replication_instance_arn=dl_dms.ref,
            source_endpoint_arn=source_endpoint.ref,
            target_endpoint_arn=target_endpoint.ref,
            replication_task_settings=ts,
            table_mappings=getMappingrules(self, configuration['tableList']))

        my_table = ddb.Table(self,
                             id='dynamoTable',
                             table_name='ControllerTable',
                             partition_key=ddb.Attribute(
                                 name='path', type=ddb.AttributeType.STRING),
                             billing_mode=ddb.BillingMode.PAY_PER_REQUEST)

        datalake_bucket = s3.Bucket(self,
                                    'datalakebucket',
                                    bucket_name='datalake-bucket-cdk-oregon')

        glue_role = _iam.Role(
            self,
            'gluerole',
            assumed_by=_iam.ServicePrincipal('glue.amazonaws.com'),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    'service-role/AWSGlueServiceRole')
            ])

        raw_bucket.grant_read(glue_role)
        datalake_bucket.grant_read_write(glue_role)

        # Lake Formation settings
        # If the managed policy 'AWSLakeFormationDataAdmin' is attached to your own IAM user,
        # extend that policy to allow "lakeformation:PutDataLakeSettings" so that the
        # data lake settings below can be applied by this CDK code.
        lake_admin_setting = _lakeformation.CfnDataLakeSettings(
            self,
            'data-lake-GrantAdmin',
            admins=[
                _lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
                    data_lake_principal_identifier=configuration[
                        'executiveArn'])
            ])

        glue_database = _glue.Database(self,
                                       'gluedatabase',
                                       database_name='data_lake_gluedb')

        glue_database.node.add_dependency(lake_admin_setting)

        glue_role_permission_inLakeFormation = _lakeformation.CfnPermissions(
            self,
            'permission-glueRole',
            data_lake_principal=_lakeformation.CfnPermissions.
            DataLakePrincipalProperty(
                data_lake_principal_identifier=glue_role.role_arn),
            resource=_lakeformation.CfnPermissions.ResourceProperty(
                database_resource=_lakeformation.CfnPermissions.
                DatabaseResourceProperty(name=glue_database.database_name)),
            permissions=['ALL'])

        crawler = _glue.CfnCrawler(
            self,
            'datalakecrawler',
            name='Crawler-datalake-cdk',
            role=glue_role.role_arn,
            targets={
                's3Targets': [{
                    'path':
                    's3://' + datalake_bucket.bucket_name + '/datalake/'
                }]
            },
            database_name='data_lake_gluedb',
            configuration=
            "{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"},\"Tables\":{\"AddOrUpdateBehavior\":\"MergeNewColumns\"}}}"
        )
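        # Hedged readability note (equivalent output, not a required change): the
        # escaped JSON string above can also be built with json.dumps, e.g.:
        #
        #   crawler_config = json.dumps({
        #       "Version": 1.0,
        #       "CrawlerOutput": {
        #           "Partitions": {"AddOrUpdateBehavior": "InheritFromTable"},
        #           "Tables": {"AddOrUpdateBehavior": "MergeNewColumns"}
        #       }
        #   })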

        initialload_script = S3Assets.Asset(self,
                                            'initial-load-code',
                                            path='./Gluejob/InitialLoad.py')
        incrementalload_script = S3Assets.Asset(
            self, 'incremental-load-code', path='./Gluejob/IncrementalLoad.py')

        initialload_script.grant_read(glue_role)
        incrementalload_script.grant_read(glue_role)
        my_table.grant_full_access(glue_role)

        initial_load_job = _glue.CfnJob(
            self,
            'initial-job',
            name='InitialLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                python_version='3',
                script_location='s3://' + initialload_script.s3_bucket_name +
                '/' + initialload_script.s3_object_key),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=configuration['glue_job_setting']
            ['job_capacity'],
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=configuration['glue_job_setting']
                ['max_concurrent_run_JobExecution']))

        incremental_load_job = _glue.CfnJob(
            self,
            'increment-job',
            name='IncrementalLoad-cdk',
            command=_glue.CfnJob.JobCommandProperty(
                name='glueetl',
                script_location='s3://' +
                incrementalload_script.s3_bucket_name + '/' +
                incrementalload_script.s3_object_key,
                python_version='3'),
            role=glue_role.role_arn,
            default_arguments={
                '--prefix': str(configuration['tableList']),
                '--bucket': raw_bucket.bucket_name,
                '--datalake_bucket': datalake_bucket.bucket_name,
                '--datalake_prefix': 'datalake/',
                '--region': CdkpyStack.of(self).region,
                '--controller_table_name': my_table.table_name
            },
            allocated_capacity=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))

        job_trigger = _glue.CfnTrigger(
            self,
            'datalake-glue-trigger',
            type='SCHEDULED',
            schedule=configuration['job_trigger_schedule'],
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(job_name=incremental_load_job.name)
            ])
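        # Hedged addition (assumption, mirroring the other examples): referencing
        # the job by name alone does not create an implicit CloudFormation
        # dependency, so make the trigger wait for the job explicitly.
        job_trigger.add_depends_on(incremental_load_job)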

        dl_sns = _sns.Topic(self, 'datalake_sns', display_name='data-lake-sns')

        endpoint_email = configuration['emailSubscriptionList']

        for email in endpoint_email:
            dl_sns.add_subscription(_subscrption.EmailSubscription(email))

        #Another way to subscribe: dl_subscription = _sns.Subscription(self,'email-subscrption',topic = dl_sns,endpoint='*****@*****.**',protocol= _sns.SubscriptionProtocol.EMAIL)

        glue_events_target = _events_targets.SnsTopic(dl_sns)

        glue_events_rule = _events.Rule(
            self,
            'gluejobevents-datalake',
            description='Using for tracking the failed glue job of data lake',
            rule_name='dl-gluejob-event',
            event_pattern=_events.EventPattern(
                source=['aws.glue'],
                detail_type=['Glue Job State Change'],
                detail={
                    "jobName": [initial_load_job.name],
                    "state": ["FAILED"]
                }),
            targets=[glue_events_target])

        dms_subscription = _dms.CfnEventSubscription(
            self,
            'dmsevents-datalake',
            sns_topic_arn=dl_sns.topic_arn,
            subscription_name='datalake-dmsevents',
            source_type='replication-task',
            event_categories=['failure'])
Example #4
    def __init__(
        self,
        scope: cdk.Construct,
        construct_id: str,
        stack_log_level: str,
        glue_db_name: str,
        glue_table_name: str,
        etl_bkt,
        src_stream,
        **kwargs,
    ) -> None:
        super().__init__(scope, construct_id, **kwargs)

        self.template_options.metadata = {"License": "Miztiik Corp."}

        # Glue Job IAM Role
        self._glue_etl_role = _iam.Role(
            self,
            "glueJobRole",
            assumed_by=_iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AmazonS3ReadOnlyAccess"),
                _iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSGlueServiceRole")
            ])
        self._glue_etl_role.add_to_policy(
            _iam.PolicyStatement(
                actions=["s3:*"],
                resources=[f"{etl_bkt.bucket_arn}",
                           f"{etl_bkt.bucket_arn}/*"]))

        self._glue_etl_role.add_to_policy(
            _iam.PolicyStatement(actions=["kinesis:DescribeStream"],
                                 resources=[f"{src_stream.stream_arn}"]))

        src_stream.grant_read(self._glue_etl_role)

        # Create the Glue job to convert incoming JSON to parquet
        # Read the Glue Spark job code (sanity check only; the script itself is uploaded below as an S3 asset)
        try:
            with open(
                    "stacks/back_end/glue_stacks/glue_job_scripts/kinesis_streams_batch_to_s3_etl.py",
                    encoding="utf-8",
                    mode="r",
            ) as f:
                kinesis_streams_batch_to_s3_etl = f.read()
        except OSError:
            print("Unable to read Glue Job Code")
            raise

        etl_script_asset = _s3_assets.Asset(
            self,
            "etlScriptAsset",
            path=
            "stacks/back_end/glue_stacks/glue_job_scripts/kinesis_streams_batch_to_s3_etl.py"
        )

        self.etl_prefix = "stream-etl"
        _glue_etl_job = _glue.CfnJob(
            self,
            "glueJsonToParquetJob",
            name="stream-etl-processor",
            description=
            "Glue Job to process stream of events from Kinesis data stream and store them in parquet format in S3",
            role=self._glue_etl_role.role_arn,
            glue_version="2.0",
            command=_glue.CfnJob.JobCommandProperty(
                name="gluestreaming",
                script_location=
                f"s3://{etl_script_asset.s3_bucket_name}/{etl_script_asset.s3_object_key}",
                python_version="3"),
            default_arguments={
                "--src_db_name": glue_db_name,
                "--src_tbl_name": glue_table_name,
                "--datalake_bkt_name": etl_bkt.bucket_name,
                "--datalake_bkt_prefix": f"{self.etl_prefix}/",
                "--job-bookmark-option": "job-bookmark-enable"
            },
            allocated_capacity=1,
            # timeout=2,
            max_retries=2,
            execution_property=_glue.CfnJob.ExecutionPropertyProperty(
                max_concurrent_runs=1))
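        # Hedged note (assumption): this is a "gluestreaming" job, and Glue job
        # bookmarks apply to batch jobs, so the "--job-bookmark-option" argument
        # above may have no effect here; streaming jobs track progress through
        # checkpointing configured in the job script instead.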

        # Configure a scheduled trigger - the cron below fires once a day at 01:00 UTC
        _glue_etl_job_trigger = _glue.CfnTrigger(
            self,
            "glueEtlJobtrigger",
            type="SCHEDULED",
            description="Miztiik Automation: Trigger streaming etl glue job daily at 01:00 UTC",
            schedule="cron(0 1 * * ? *)",
            start_on_creation=False,
            actions=[
                _glue.CfnTrigger.ActionProperty(
                    job_name=f"{_glue_etl_job.name}", timeout=2)
            ])
        _glue_etl_job_trigger.add_depends_on(_glue_etl_job)

        ###########################################
        ################# OUTPUTS #################
        ###########################################
        output_0 = cdk.CfnOutput(
            self,
            "AutomationFrom",
            value=f"{GlobalArgs.SOURCE_INFO}",
            description=
            "To know more about this automation stack, check out our github page.",
        )

        output_1 = cdk.CfnOutput(
            self,
            "StreamingETLGlueJob",
            value=
            f"https://console.aws.amazon.com/gluestudio/home?region={cdk.Aws.REGION}#/jobs",
            description="Glue ETL Job.",
        )
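# Hedged usage sketch (hypothetical module and class names): each stack above is
# instantiated from a CDK app entry point, roughly like this:
#
#   #!/usr/bin/env python3
#   from aws_cdk import core
#   from stacks.glue_stack import GlueStack   # hypothetical import path
#
#   app = core.App()
#   GlueStack(app, "glue-etl-stack")
#   app.synth()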