Example #1
0
def create_dynamodb_table():
    """
    Create the DynamoDB table named by DYNAMODB_TABLE_NAME and wait for it to exist.

    The table gets a single numeric ('N') HASH key, DYNAMODB_TABLE_HASH_KEY, and
    provisioned throughput of 20 read / 20 write capacity units.
    """
    dynamodb = DynamoDBHook(client_type='dynamodb').conn

    hash_key_schema = {'AttributeName': DYNAMODB_TABLE_HASH_KEY, 'KeyType': 'HASH'}
    hash_key_definition = {'AttributeName': DYNAMODB_TABLE_HASH_KEY, 'AttributeType': 'N'}
    throughput = {'ReadCapacityUnits': 20, 'WriteCapacityUnits': 20}

    dynamodb.create_table(
        TableName=DYNAMODB_TABLE_NAME,
        KeySchema=[hash_key_schema],
        AttributeDefinitions=[hash_key_definition],
        ProvisionedThroughput=throughput,
    )

    # Table creation is not instantaneous; block on the 'table_exists' waiter
    # so later writes do not race against a table that is still being created.
    dynamodb.get_waiter('table_exists').wait(
        TableName=DYNAMODB_TABLE_NAME,
        WaiterConfig={'Delay': 1},
    )
def get_dynamodb_item_count():
    """
    Print the number of items in the DynamoDB table, counted by scanning it.

    A DynamoDB table has an ItemCount value, but it is only updated every six hours,
    so to verify this DAG worked the table is scanned and the items counted manually.
    """
    dynamodb_table = DynamoDBHook(resource_type='dynamodb').conn.Table(DYNAMODB_TABLE_NAME)

    total = 0
    scan_kwargs = {'Select': 'COUNT'}
    while True:
        page = dynamodb_table.scan(**scan_kwargs)
        total += page['Count']
        # A page without 'LastEvaluatedKey' is the last one.
        if 'LastEvaluatedKey' not in page:
            break
        # Otherwise resume the scan from where this page stopped.
        scan_kwargs['ExclusiveStartKey'] = page['LastEvaluatedKey']

    print(f'DynamoDB table contains {total} items.')
Example #3
0
    def execute(self, context: 'Context') -> None:
        """
        Scan the configured DynamoDB table into a temporary file and upload it to S3.

        The scan is delegated to ``self._scan_dynamodb_and_upload_to_s3``; the file
        it returns is then passed to ``_upload_file_to_s3`` together with the
        configured bucket name, key prefix and AWS connection id.

        The final upload happens only when the scan returns normally. Any exception
        raised by the scan propagates unchanged and nothing more is uploaded; this
        includes ``BaseException`` subclasses such as ``KeyboardInterrupt``, so an
        interrupted run no longer pushes a partial file.

        :param context: task execution context (not used by this method).
        """
        hook = DynamoDBHook(aws_conn_id=self.aws_conn_id)
        table = hook.get_conn().Table(self.dynamodb_table_name)

        # Work on a shallow copy so the operator's own kwargs are not shared with the scan.
        scan_kwargs = copy(self.dynamodb_scan_kwargs) if self.dynamodb_scan_kwargs else {}
        f: IO[Any]
        with NamedTemporaryFile() as f:
            # NOTE(review): the helper's return value is rebound to ``f``, so it
            # presumably may hand back a different file object than the one it
            # was given -- confirm against the helper.
            f = self._scan_dynamodb_and_upload_to_s3(f, scan_kwargs, table)
            # Reached only if the scan did not raise.
            _upload_file_to_s3(f, self.s3_bucket_name, self.s3_key_prefix, self.aws_conn_id)
Example #4
0
    def execute(self, context: 'Context'):
        """
        Run ``self.sql`` against Hive and batch-write the result to DynamoDB.

        When ``self.pre_process`` is None the query result is converted to a list
        of records via JSON; otherwise ``self.pre_process`` is called with the
        result and its return value is written instead.
        """
        hive_hook = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

        self.log.info('Extracting data from Hive')
        self.log.info(self.sql)

        frame = hive_hook.get_pandas_df(self.sql, schema=self.schema)
        dynamodb_hook = DynamoDBHook(
            aws_conn_id=self.aws_conn_id,
            table_name=self.table_name,
            table_keys=self.table_keys,
            region_name=self.region_name,
        )

        self.log.info('Inserting rows into dynamodb')

        if self.pre_process is not None:
            # Caller-supplied hook decides what gets written.
            items = self.pre_process(
                data=frame,
                args=self.pre_process_args,
                kwargs=self.pre_process_kwargs,
            )
        else:
            # Round-trip through JSON to get one plain dict per row.
            items = json.loads(frame.to_json(orient='records'))
        dynamodb_hook.write_batch_data(items)

        self.log.info('Done.')
Example #5
0
def delete_dynamodb_table():
    """Delete the DynamoDB table named by DYNAMODB_TABLE_NAME."""
    dynamodb_client = DynamoDBHook(client_type='dynamodb').conn
    dynamodb_client.delete_table(TableName=DYNAMODB_TABLE_NAME)