Ejemplo n.º 1
0
    def __init__(self, scope: core.Construct, data_lake: DataLake,
                 **kwargs) -> None:
        """Create the Glue catalog resources for the raw layer of the data lake.

        Declares a crawler, scheduled every 15 minutes, over the
        ``atomic_events`` prefix of the raw bucket, plus an explicitly typed
        ``orders`` table in the raw database.

        Args:
            scope: Parent construct.
            data_lake: Source of the environment name, the crawler role, the
                raw bucket and the raw database used below.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        self.env = data_lake.env.value
        super().__init__(scope, id=f'{self.env}-glue-catalog', **kwargs)

        raw_bucket = data_lake.data_lake_raw_bucket
        raw_database = data_lake.data_lake_raw_database
        crawler_name = f'{self.env}-atomic-events-crawler'

        self.atomic_events_crawler = glue.CfnCrawler(
            self,
            crawler_name,
            name=crawler_name,
            description=('Crawler to detect schema of data stored in data '
                         'lake raw, atomic events'),
            schedule=glue.CfnCrawler.ScheduleProperty(
                schedule_expression='cron(0/15 * * * ? *)'),
            role=data_lake.data_lake_role.role_arn,
            targets=glue.CfnCrawler.TargetsProperty(s3_targets=[
                glue.CfnCrawler.S3TargetProperty(
                    path=f's3://{raw_bucket.bucket_name}/atomic_events')
            ]),
            database_name=raw_database.database_name)

        # Column types must be Hive/Athena DDL type names: 'datetime' and
        # 'integer' are not, so 'timestamp' and 'int' are used instead.
        orders_schema = (
            ('created_at', 'timestamp'),
            ('order_id', 'int'),
            ('product_name', 'string'),
            ('value', 'float'),
        )

        self.orders_table = glue.Table(
            self,
            f'{self.env}-orders-table',
            table_name='orders',
            description='orders captured from Postgres using DMS CDC',
            database=raw_database,
            compressed=True,
            data_format=glue.DataFormat(
                input_format=glue.InputFormat.TEXT,
                output_format=glue.OutputFormat.HIVE_IGNORE_KEY_TEXT,
                serialization_library=glue.SerializationLibrary.OPEN_CSV),
            s3_prefix='orders',
            bucket=raw_bucket,
            columns=[
                glue.Column(name=column_name,
                            type=glue.Type(input_string=type_name,
                                           is_primitive=True))
                for column_name, type_name in orders_schema
            ])
Ejemplo n.º 2
0
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        """Declare the ``poc`` Glue database and a JSON-backed table in it.

        The table ``my_table`` has two data columns and a ``dt`` partition key,
        and reads from the ``test_data`` prefix of the existing bucket
        ``my_bucket``.

        Args:
            scope: Parent construct.
            id: Construct id.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(scope, id, **kwargs)

        self._region = 'aws_region'
        self._account_id = 'aws_account_id'

        def primitive(type_name):
            # Every column in this table uses a primitive Glue type.
            return glue.Type(input_string=type_name, is_primitive=True)

        existing_bucket = s3.Bucket.from_bucket_name(self, 'my_bucket_id',
                                                     'my_bucket')
        poc_database = glue.Database(self,
                                     id='my_database_id',
                                     database_name='poc')

        # Text input/output formats combined with the OpenX JSON SerDe.
        text_input = glue.InputFormat(
            'org.apache.hadoop.mapred.TextInputFormat')
        text_output = glue.OutputFormat(
            'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat')
        json_serde = glue.SerializationLibrary(
            'org.openx.data.jsonserde.JsonSerDe')

        data_columns = [
            glue.Column(name='col1', type=primitive('string')),
            glue.Column(name='col2', type=primitive('int')),
        ]
        partition_columns = [
            glue.Column(name='dt', type=primitive('string')),
        ]

        glue.Table(
            self,
            id='my_table_id',
            database=poc_database,
            table_name='my_table',
            columns=data_columns,
            partition_keys=partition_columns,
            bucket=existing_bucket,
            s3_prefix='test_data',
            data_format=glue.DataFormat(input_format=text_input,
                                        output_format=text_output,
                                        serialization_library=json_serde))
 def __init__(
     self,
     scope: core.Construct,
     glue_database: BaseDataLakeGlueDatabase,
     glue_role: BaseDataLakeGlueRole,
     **kwargs,
 ) -> None:
     """Define the Parquet ``orders`` table inside the given Glue database.

     Args:
         scope: Parent construct.
         glue_database: Database that owns the table; its ``deploy_env`` and
             ``data_lake_bucket`` attributes are read here.
         glue_role: Stored on the instance as ``glue_role``; not otherwise
             used in this method.
         **kwargs: Forwarded unchanged to the parent constructor.
     """
     self.glue_role = glue_role
     self.glue_database = glue_database
     self.deploy_env = self.glue_database.deploy_env
     self.data_lake_bucket = self.glue_database.data_lake_bucket
     self.obj_name = f"glue-{self.deploy_env.value}-orders-table"

     # (column name, primitive Glue type) pairs, in table order.
     schema = (
         ("op", "string"),
         ("extracted_at", "string"),
         ("created_at", "timestamp"),
         ("order_id", "int"),
         ("product_name", "string"),
         ("value", "double"),
     )
     orders_columns = [
         glue.Column(
             name=column_name,
             type=glue.Type(input_string=type_name, is_primitive=True),
         ) for column_name, type_name in schema
     ]

     super().__init__(
         scope,
         self.obj_name,
         table_name="orders",
         description="orders captured from Postgres using DMS CDC",
         database=self.glue_database,
         compressed=True,
         data_format=glue.DataFormat.PARQUET,
         s3_prefix="orders/public/orders",
         bucket=self.data_lake_bucket,
         columns=orders_columns,
         **kwargs,
     )
Ejemplo n.º 4
0
def glue_column(name, col_type, is_primitive=True):
    """Build a ``glue.Column`` named ``name`` with the given type string.

    Args:
        name: Column name.
        col_type: Passed to ``glue.Type`` as ``input_string``.
        is_primitive: Passed to ``glue.Type`` as ``is_primitive``.

    Returns:
        The constructed ``glue.Column``.
    """
    column_type = glue.Type(input_string=col_type, is_primitive=is_primitive)
    return glue.Column(name=name, type=column_type)
Ejemplo n.º 5
0
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        """Provision buckets, a Glue database and table, and a daily crawler.

        Creates an access-log bucket, a data bucket that logs to it, a Glue
        database and partitioned JSON table named after ``PROJECT_NAME``, a
        crawler role with access to the ``PROJECT_PREFIX`` objects of the data
        bucket, and a crawler scheduled with ``cron(30 04 * * ? *)``.

        Args:
            scope: Parent construct.
            id: Construct id.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(scope, id, **kwargs)

        def primitive(type_name):
            # Every column below uses a primitive Glue type.
            return glue.Type(input_string=type_name, is_primitive=True)

        s3_logs_bucket = s3.Bucket(
            self,
            "LogsBucket",
            encryption=s3.BucketEncryption.KMS_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            lifecycle_rules=[
                s3.LifecycleRule(
                    abort_incomplete_multipart_upload_after=core.Duration.days(
                        7),
                    expiration=core.Duration.days(30))
            ])

        s3_data_bucket = s3.Bucket(
            self,
            "DataBucket",
            encryption=s3.BucketEncryption.KMS_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            server_access_logs_bucket=s3_logs_bucket,
            server_access_logs_prefix=f"s3accesslogs/{PROJECT_NAME}/")

        glue_database = glue.Database(self,
                                      "GlueDatabase",
                                      database_name=PROJECT_NAME)

        # NOTE(review): no `bucket` / `s3_prefix` is passed to this table, so
        # it is not pointed at DataBucket -- confirm this is intended.
        glue.Table(
            self,
            "GlueTable",
            columns=[
                glue.Column(name="timestamp", type=primitive("int")),
                glue.Column(name="celcius", type=primitive("double")),
                glue.Column(name="fahrenheit", type=primitive("double")),
            ],
            database=glue_database,
            data_format=glue.DataFormat(
                input_format=glue.InputFormat(
                    "org.apache.hadoop.mapred.TextInputFormat"),
                output_format=glue.OutputFormat(
                    "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
                ),
                serialization_library=glue.SerializationLibrary(
                    "org.openx.data.jsonserde.JsonSerDe")),
            table_name=PROJECT_NAME,
            encryption=glue.TableEncryption.S3_MANAGED,
            partition_keys=[
                glue.Column(name="year", type=primitive("int")),
                glue.Column(name="month", type=primitive("int")),
                glue.Column(name="day", type=primitive("int")),
            ])

        glue_crawler_role = iam.Role(
            self,
            "GlueCrawlerRole",
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AWSGlueServiceRole")
            ])

        # The object-level grant needs a trailing wildcard: "<prefix>/" alone
        # matches only the single key with that exact name, which would leave
        # the crawler unable to read the objects stored under the prefix.
        prefix_objects = f"{PROJECT_PREFIX}/*"
        s3_data_bucket.grant_read(glue_crawler_role,
                                  objects_key_pattern=prefix_objects)
        s3_data_bucket.grant_put(glue_crawler_role,
                                 objects_key_pattern=prefix_objects)

        glue.CfnCrawler(
            self,
            "GlueCrawler",
            role=glue_crawler_role.role_arn,
            database_name=glue_database.database_name,
            targets={
                "s3Targets": [{
                    # Crawler S3 targets are given as full s3:// URIs.
                    "path":
                    f"s3://{s3_data_bucket.bucket_name}/{PROJECT_PREFIX}/"
                }]
            },
            schedule={"scheduleExpression": "cron(30 04 * * ? *)"})
Ejemplo n.º 6
0
    def __init__(self, scope: core.Construct, id: str, config_dict,
                 **kwargs) -> None:
        """Create the data lake Glue database and the compound data table.

        Both resource names are exported as CloudFormation outputs.

        Args:
            scope: Parent construct.
            id: Construct id.
            config_dict: Mapping that must contain ``datalake_db_name`` and
                ``datalake_bucket_name``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(scope, id, **kwargs)

        # Data lake database.
        datalake_db = glue.Database(
            self,
            "createDatalakeDB",
            database_name=config_dict['datalake_db_name'])

        core.CfnOutput(self,
                       "createDatalakeDBName",
                       value=datalake_db.database_name)

        # Compound registration table: every data column is a plain string.
        string_type = glue.Type(input_string="string", is_primitive=True)
        column_names = (
            "lot_compound_id",
            "version_id",
            "parent_id",
            "smiles",
            "parent_mw",
            "salt_multiplicity",
            "salt_name",
            "formula_weight",
            "parent_alias",
            "stereochemistry",
            "stereocomment",
            "geometric_isomerism",
            "parent_comment",
            "parent_project",
            "elnref",
            "msmethod",
            "msmass",
            "provider",
            "purity",
            "puritymethod",
            "nmrshifts",
            "lotalias",
            "lot_comment",
            "lot_project",
            "molfile",
            "checksum",
        )

        comp_reg_table = glue.Table(
            self,
            "createDatalakeCompRegTable",
            columns=[
                glue.Column(name=column_name, type=string_type)
                for column_name in column_names
            ],
            # The database construct is passed directly; re-importing it by
            # ARN inside the stack that defines it adds nothing.
            database=datalake_db,
            data_format=glue.DataFormat(
                input_format=glue.InputFormat.PARQUET,
                output_format=glue.OutputFormat.PARQUET,
                serialization_library=glue.SerializationLibrary.PARQUET),
            table_name="tbl_compound_data",
            bucket=s3.Bucket.from_bucket_name(
                self,
                "getIBucket",
                bucket_name=config_dict['datalake_bucket_name']),
            compressed=True,
            description=
            "This table contains data regarding compound registration coming from  RDS",
            partition_keys=[glue.Column(name="dt", type=string_type)],
            s3_prefix="compound_reg/compound_data/")

        core.CfnOutput(self,
                       "createDatalakeCompRegTableName",
                       value=comp_reg_table.table_name)
Ejemplo n.º 7
0
    def __init__(self, scope: core.Construct, app: PMIApp, cid: str, *,
                 partner: str, bucket: s3.IBucket, database: glue.IDatabase):
        """Define the compressed TSV table of viewability metrics for a partner.

        Args:
            scope: Parent construct.
            app: Application object; its ``env_id`` selects the shared
                resources stack whose exported data prefix is imported.
            cid: Construct id.
            partner: Partner name; used as the table name and as the last
                segment of the S3 prefix.
            bucket: Bucket holding the table data.
            database: Glue database that owns the table.
        """
        date = glue.Schema.DATE
        integer = glue.Schema.INTEGER
        big_int = glue.Schema.BIG_INT
        double = glue.Schema.DOUBLE
        string = glue.Schema.STRING

        # (column name, type) pairs, in table order.
        schema = [
            ('hit_date', date),
            ('measurement_source_id', integer),
            ('partner_measured_advertiser_id', big_int),
            ('partner_measured_campaign_id', big_int),
            ('partner_measured_channel_id', big_int),
            ('partner_measured_placement_id', big_int),
            ('partner_measured_creative_id', big_int),
            ('media_type_id', integer),
            ('below_the_fold_imps', integer),
            ('on_the_fold_imps', integer),
            ('above_the_fold_imps', integer),
            ('time_on_page', big_int),
            ('in_view_time', big_int),
            ('in_view_imps', integer),
            ('in_view_5s_imps', integer),
            ('in_view_15s_imps', integer),
            ('not_in_view_imps', integer),
            ('never_in_view_imps', integer),
            ('in_view_load_imps', integer),
            ('in_view_unload_imps', integer),
            ('completed_1q_imps', integer),
            ('in_view_1q_imps', integer),
            ('completed_2q_imps', integer),
            ('in_view_2q_imps', integer),
            ('completed_3q_imps', integer),
            ('in_view_3q_imps', integer),
            ('completed_4q_imps', integer),
            ('in_view_4q_imps', integer),
            ('never_started_imps', integer),
            ('muted_imps', integer),
            ('full_screen_imps', integer),
            ('click_through_imps', integer),
            ('sivt_in_view_imps', integer),
            ('sivt_not_in_view_imps', integer),
            ('groupm_time_on_page', big_int),
            ('groupm_in_view_time', big_int),
            ('groupm_in_view_imps', integer),
            ('groupm_in_view_5s_imps', integer),
            ('groupm_in_view_15s_imps', integer),
            ('groupm_not_in_view_imps', integer),
            ('groupm_never_in_view_imps', integer),
            ('groupm_in_view_load_imps', integer),
            ('groupm_in_view_unload_imps', integer),
            ('groupm_completed_1q_imps', integer),
            ('groupm_in_view_1q_imps', integer),
            ('groupm_completed_2q_imps', integer),
            ('groupm_in_view_2q_imps', integer),
            ('groupm_completed_3q_imps', integer),
            ('groupm_in_view_3q_imps', integer),
            ('groupm_completed_4q_imps', integer),
            ('groupm_in_view_4q_imps', integer),
            ('groupm_never_started_imps', integer),
            ('groupm_muted_imps', integer),
            ('groupm_full_screen_imps', integer),
            ('groupm_click_through_imps', integer),
            ('groupm_sivt_in_view_imps', integer),
            ('groupm_sivt_not_in_view_imps', integer),
            ('suspicious_imps', integer),
            ('measured_imps', integer),
            ('groupm_measured_imps', integer),
            ('general_invalid_imps', integer),
            ('viewability_measurement_trusted_imps', integer),
            ('imps', integer),
            ('sitting_duck_bot_imps', integer),
            ('standard_bot_imps', integer),
            ('volunteer_bot_imps', integer),
            ('profile_bot_imps', integer),
            ('masked_bot_imps', integer),
            ('nomadic_bot_imps', integer),
            ('other_bot_imps', integer),
            ('true_view_viewable_imps', integer),
            ('true_view_measurable_imps', integer),
            ('yahoo_gemini_billable_imps', integer),
            ('full_ad_in_view_imps', integer),
            ('pm_platform', string),
            ('publicis_in_view_imps', integer),
            ('yahoo_gemini_billable_suspicious_imps', integer),
            ('average_in_view_time', double),
            ('in_view_lt_1s_imps', integer),
            ('in_view_1s_2s_imps', integer),
            ('in_view_2s_5s_imps', integer),
            ('in_view_5s_10s_imps', integer),
            ('in_view_10s_15s_imps', integer),
            ('in_view_15s_20s_imps', integer),
            ('in_view_20s_25s_imps', integer),
            ('in_view_25s_30s_imps', integer),
            ('in_view_30s_35s_imps', integer),
            ('in_view_35s_40s_imps', integer),
            ('in_view_40s_45s_imps', integer),
            ('in_view_45s_50s_imps', integer),
            ('in_view_ge_50s_imps', integer),
            ('viewability_measured_or_fraud_ads', integer),
        ]

        # Data lives under "<imported shared data prefix><partner>/".
        shared_prefix = core.Fn.import_value(
            f'{SHARED_RESOURCES_STACK_NAME_BASE}-{app.env_id}:DataLakeMartBucketDataPrefix'
        )
        table_prefix = core.Fn.join('', [shared_prefix, partner, '/'])

        super().__init__(
            scope,
            cid,
            database=database,
            table_name=partner,
            description=f"Aggregated viewability metrics for {partner}.",
            columns=[
                glue.Column(name=column_name, type=column_type)
                for column_name, column_type in schema
            ],
            partition_keys=[glue.Column(name='estdate', type=string)],
            data_format=glue.DataFormat.TSV,
            bucket=bucket,
            s3_prefix=table_prefix,
            compressed=True)

        # Overrides applied on the underlying CfnTable resource.
        cfn_table: glue.CfnTable = self.node.default_child

        # Serialization properties: tab delimiter and \N as the null marker.
        serde_parameters = {
            'field.delim': '\t',
            'serialization.null.format': '\\N'
        }
        cfn_table.add_property_override(
            'TableInput.StorageDescriptor.SerdeInfo.Parameters',
            serde_parameters)

        # Data retention period.
        cfn_table.add_property_override('TableInput.Retention', 365)
    def __init__(self, scope: core.Construct, construct_id: str,
                 **kwargs) -> None:
        """Build the Reddit sentiment pipeline: Glue schema, Firehose, Fargate.

        Creates a Glue database and Parquet table describing the records, a
        Firehose delivery stream that converts incoming JSON to Parquet using
        that schema and writes to the bucket at ``BUCKET_ARN``, and a Fargate
        service running the streaming application that feeds the stream.

        Args:
            scope: Parent construct.
            construct_id: Construct id.
            **kwargs: Forwarded unchanged to the parent constructor.

        Raises:
            KeyError: If ``PRAW_CLIENT_SECRET``, ``PRAW_CLIENT_ID`` or
                ``PRAW_USER_AGENT`` is missing from the process environment.
        """
        super().__init__(scope, construct_id, **kwargs)
        # create db for glue schema
        glue_db = glue.Database(
            self,
            'GlueDB',
            database_name='reddit_data',
        )

        # data schema
        glue_table = glue.Table(
            self,
            'GlueTable',
            table_name='sentiment',
            columns=[
                glue.Column(name='@timestamp', type=glue.Schema.TIMESTAMP),
                glue.Column(name='id', type=glue.Schema.STRING),
                glue.Column(name='subreddit', type=glue.Schema.STRING),
                glue.Column(name='body', type=glue.Schema.STRING),
                glue.Column(name='is_submitter', type=glue.Schema.BOOLEAN),
                glue.Column(name='polarity', type=glue.Schema.FLOAT),
                glue.Column(name='subjectivity', type=glue.Schema.FLOAT),
                glue.Column(name='author', type=glue.Schema.STRING),
            ],
            database=glue_db,
            data_format=glue.DataFormat.PARQUET,
            bucket=s3.Bucket.from_bucket_arn(self, 'DataBucket', BUCKET_ARN),
            s3_prefix='reddit/',
        )

        # role assumed by firehose
        stream_role = iam.Role(
            self,
            'FirehoseRole',
            assumed_by=iam.ServicePrincipal('firehose.amazonaws.com'),
            description='role used by Firehose to access s3 bucket',
        )

        # add s3 statement
        stream_role.add_to_policy(
            iam.PolicyStatement(
                resources=[BUCKET_ARN, f'{BUCKET_ARN}/*'],
                actions=[
                    's3:AbortMultipartUpload',
                    's3:GetBucketLocation',
                    's3:GetObject',
                    's3:ListBucket',
                    's3:ListBucketMultipartUploads',
                    's3:PutObject',
                ],
            ))

        # add glue statement
        stream_role.add_to_policy(
            iam.PolicyStatement(
                resources=[
                    glue_table.table_arn,
                    glue_db.database_arn,
                    glue_db.catalog_arn,
                ],
                actions=[
                    'glue:GetTable',
                    'glue:GetTableVersion',
                    'glue:GetTableVersions',
                ],
            ))

        # cloudwatch statement
        stream_role.add_to_policy(
            iam.PolicyStatement(
                resources=['*'],
                actions=[
                    'logs:PutLogEvents',
                ],
            ))

        # JSON in, Parquet out, with the schema taken from the Glue table.
        json_input = kf.CfnDeliveryStream.InputFormatConfigurationProperty(
            deserializer=kf.CfnDeliveryStream.DeserializerProperty(
                hive_json_ser_de=kf.CfnDeliveryStream.HiveJsonSerDeProperty(),
            ), )
        parquet_output = kf.CfnDeliveryStream.OutputFormatConfigurationProperty(
            serializer=kf.CfnDeliveryStream.SerializerProperty(
                parquet_ser_de=kf.CfnDeliveryStream.ParquetSerDeProperty(),
            ), )
        glue_schema = kf.CfnDeliveryStream.SchemaConfigurationProperty(
            database_name=glue_db.database_name,
            table_name=glue_table.table_name,
            role_arn=stream_role.role_arn,
            # The Glue database is created by this stack, so the schema lives
            # in the stack's own region rather than a hard-coded one.
            region=core.Stack.of(self).region,
        )
        data_format_conversion_configuration = kf.CfnDeliveryStream.DataFormatConversionConfigurationProperty(
            enabled=True,
            input_format_configuration=json_input,
            output_format_configuration=parquet_output,
            schema_configuration=glue_schema,
        )

        s3_config = kf.CfnDeliveryStream.ExtendedS3DestinationConfigurationProperty(
            bucket_arn=BUCKET_ARN,  # temporary, will replace with env variable
            role_arn=stream_role.role_arn,
            data_format_conversion_configuration=
            data_format_conversion_configuration,
            prefix='reddit/',
            buffering_hints=kf.CfnDeliveryStream.BufferingHintsProperty(
                size_in_m_bs=64, ),
        )

        firehose = kf.CfnDeliveryStream(
            self,
            'FirehoseStream',
            delivery_stream_name='RedditDataStream',
            extended_s3_destination_configuration=s3_config,
        )

        # add role dependency
        firehose.node.add_dependency(stream_role)

        # add ECS Fargate instance
        app_role = iam.Role(
            self,
            'RedditStreamingAppRole',
            assumed_by=iam.ServicePrincipal('ecs-tasks.amazonaws.com'),
            description=
            'Role used by the Reddit Streaming Application Fargate Task',
        )

        # add firehose permissions
        app_role.add_to_policy(
            iam.PolicyStatement(
                resources=[firehose.attr_arn],
                actions=[
                    'firehose:DeleteDeliveryStream',
                    'firehose:PutRecord',
                    'firehose:PutRecordBatch',
                    'firehose:UpdateDestination',
                ],
            ))

        # add ecs and cloudwatch permissions
        app_role.add_to_policy(
            iam.PolicyStatement(
                resources=['*'],
                actions=[
                    'ecr:GetAuthorizationToken',
                    'ecr:BatchCheckLayerAvailability',
                    'ecr:GetDownloadUrlForLayer',
                    'ecr:BatchGetImage',
                    'logs:CreateLogStream',
                    'logs:PutLogEvents',
                ],
            ))

        vpc = ec2.Vpc(self, 'RedditVpc', max_azs=3)

        cluster = ecs.Cluster(self, 'RedditCluster', vpc=vpc)

        task_definition = ecs.FargateTaskDefinition(
            self,
            'TaskDefinition',
            memory_limit_mib=512,
            cpu=256,
            task_role=app_role,
        )

        # NOTE(review): the PRAW credentials are read from the synthesizing
        # process's environment and passed as plain container environment
        # values -- consider a secrets mechanism for the client secret.
        task_definition.add_container(
            id='RedditStreamingApp',
            image=ecs.ContainerImage.from_asset('./sentiment_analysis'),
            command=['all'],
            environment={
                'FIREHOSE_STREAM_NAME': firehose.delivery_stream_name,
                'PRAW_CLIENT_SECRET': os.environ['PRAW_CLIENT_SECRET'],
                'PRAW_CLIENT_ID': os.environ['PRAW_CLIENT_ID'],
                'PRAW_USER_AGENT': os.environ['PRAW_USER_AGENT'],
            },
            logging=ecs.LogDriver.aws_logs(stream_prefix='reddit'),
        )

        ecs.FargateService(
            self,
            'StreamingApplication',
            desired_count=1,
            task_definition=task_definition,
            cluster=cluster,
            assign_public_ip=True,
        )
Ejemplo n.º 9
0
    def __init__(self, scope: core.Construct, app: PMIApp, cid: str, *,
                 partner: str, bucket: s3.IBucket, database: glue.IDatabase):
        """Declare the Glue table describing one partner's JAS ad sessions.

        The table is named after ``partner``, stored as Avro in ``bucket`` and
        partitioned by the string keys ``estdate`` and ``esthour``.

        Args:
            scope: Parent construct the table is attached to.
            app: Application object; only ``app.env_id`` is read, to build the
                name of the imported data-prefix export.
            cid: Construct id of this table within ``scope``.
            partner: Partner name; used as the table name, in the description
                and as the last segment of the S3 prefix.
            bucket: Bucket that holds the table data.
            database: Glue database the table is registered in.
        """
        # Resolve each Glue type once and reuse it in the schema table below.
        string_type = glue.Schema.STRING
        integer_type = glue.Schema.INTEGER
        big_int_type = glue.Schema.BIG_INT
        boolean_type = glue.Schema.BOOLEAN
        double_type = glue.Schema.DOUBLE

        # (column name, Glue type) pairs; the order here is the column order
        # of the resulting table and must not be rearranged.
        data_schema = (
            # identifiers
            ('impression_id', string_type),
            ('site', string_type),
            ('measurement_source_id', integer_type),
            ('partner_measured_advertiser_id', big_int_type),
            ('partner_measured_campaign_id', big_int_type),
            ('partner_measured_channel_id', big_int_type),
            ('partner_measured_placement_id', big_int_type),
            ('partner_measured_creative_id', big_int_type),
            ('media_type_id', integer_type),
            # fold position
            ('below_the_fold', boolean_type),
            ('on_the_fold', boolean_type),
            ('above_the_fold', boolean_type),
            # base metrics
            ('time_on_page', integer_type),
            ('in_view_time', integer_type),
            ('in_view', boolean_type),
            ('in_view_5s', boolean_type),
            ('in_view_15s', boolean_type),
            ('not_in_view', boolean_type),
            ('never_in_view', boolean_type),
            ('in_view_load', boolean_type),
            ('in_view_unload', boolean_type),
            ('completed_1q', boolean_type),
            ('in_view_1q', boolean_type),
            ('completed_2q', boolean_type),
            ('in_view_2q', boolean_type),
            ('completed_3q', boolean_type),
            ('in_view_3q', boolean_type),
            ('completed_4q', boolean_type),
            ('in_view_4q', boolean_type),
            ('never_started', boolean_type),
            ('muted', boolean_type),
            ('full_screen', boolean_type),
            ('click_through', boolean_type),
            ('sivt_in_view', boolean_type),
            ('sivt_not_in_view', boolean_type),
            # "groupm_"-prefixed counterparts of the base metrics above
            ('groupm_time_on_page', integer_type),
            ('groupm_in_view_time', integer_type),
            ('groupm_in_view', boolean_type),
            ('groupm_in_view_5s', boolean_type),
            ('groupm_in_view_15s', boolean_type),
            ('groupm_not_in_view', boolean_type),
            ('groupm_never_in_view', boolean_type),
            ('groupm_in_view_load', boolean_type),
            ('groupm_in_view_unload', boolean_type),
            ('groupm_completed_1q', boolean_type),
            ('groupm_in_view_1q', boolean_type),
            ('groupm_completed_2q', boolean_type),
            ('groupm_in_view_2q', boolean_type),
            ('groupm_completed_3q', boolean_type),
            ('groupm_in_view_3q', boolean_type),
            ('groupm_completed_4q', boolean_type),
            ('groupm_in_view_4q', boolean_type),
            ('groupm_never_started', boolean_type),
            ('groupm_muted', boolean_type),
            ('groupm_full_screen', boolean_type),
            ('groupm_click_through', boolean_type),
            ('groupm_sivt_in_view', boolean_type),
            ('groupm_sivt_not_in_view', boolean_type),
            # measurement / validity flags
            ('suspicious', boolean_type),
            ('measured', boolean_type),
            ('groupm_measured', boolean_type),
            ('general_invalid', boolean_type),
            ('viewability_measurement_trusted', boolean_type),
            # bot flags
            ('sitting_duck_bot', boolean_type),
            ('standard_bot', boolean_type),
            ('volunteer_bot', boolean_type),
            ('profile_bot', boolean_type),
            ('masked_bot', boolean_type),
            ('nomadic_bot', boolean_type),
            ('other_bot', boolean_type),
            # remaining flags
            ('true_view_viewable', boolean_type),
            ('true_view_measurable', boolean_type),
            ('yahoo_gemini_billable', boolean_type),
            ('full_ad_in_view', boolean_type),
            ('publicis_in_view', boolean_type),
            ('yahoo_gemini_billable_suspicious', boolean_type),
            ('average_in_view_time', double_type),
            # in-view duration range flags
            ('in_view_lt_1s', boolean_type),
            ('in_view_1s_2s', boolean_type),
            ('in_view_2s_5s', boolean_type),
            ('in_view_5s_10s', boolean_type),
            ('in_view_10s_15s', boolean_type),
            ('in_view_15s_20s', boolean_type),
            ('in_view_20s_25s', boolean_type),
            ('in_view_25s_30s', boolean_type),
            ('in_view_30s_35s', boolean_type),
            ('in_view_35s_40s', boolean_type),
            ('in_view_40s_45s', boolean_type),
            ('in_view_45s_50s', boolean_type),
            ('in_view_ge_50s', boolean_type),
            ('viewability_measured_or_fraud', boolean_type),
        )
        partition_schema = (
            ('estdate', string_type),
            ('esthour', string_type),
        )

        def build_columns(schema):
            # Turn (name, type) pairs into glue.Column objects, keeping order.
            return [
                glue.Column(name=column_name, type=column_type)
                for column_name, column_type in schema
            ]

        # The table's S3 prefix is the data prefix exported by the shared
        # resources stack for this environment, followed by "<partner>/".
        prefix_export = f'{SHARED_RESOURCES_STACK_NAME_BASE}-{app.env_id}:DataLakeJASBucketDataPrefix'
        table_prefix = core.Fn.join(
            '', [core.Fn.import_value(prefix_export), partner, '/'])

        super().__init__(
            scope,
            cid,
            database=database,
            table_name=partner,
            description=f"Ad sessions (JAS) for {partner}.",
            columns=build_columns(data_schema),
            partition_keys=build_columns(partition_schema),
            data_format=glue.DataFormat.AVRO,
            bucket=bucket,
            s3_prefix=table_prefix,
        )

        # Data retention period: set TableInput.Retention to 90 through an
        # override on the construct's default child (annotated as the
        # underlying CfnTable). NOTE(review): unit of the value is not
        # visible here -- confirm against the Glue TableInput documentation.
        underlying_table: glue.CfnTable = self.node.default_child
        underlying_table.add_property_override('TableInput.Retention', 90)