Ejemplo n.º 1
0
    def execute(self, context):
        """Copy the result of ``self.sql`` from Hive into a DynamoDB table.

        The query is run through a ``HiveServer2Hook`` and fetched as a
        pandas DataFrame, then handed to
        ``AwsDynamoDBHook.write_batch_data``.  When ``self.pre_process`` is
        ``None`` the frame is serialised to JSON with ``orient='records'``
        and parsed back into Python objects; otherwise the return value of
        ``self.pre_process`` is written as-is.

        :param context: task context; not used by this method.
        """
        hive_hook = HiveServer2Hook(
            hiveserver2_conn_id=self.hiveserver2_conn_id)

        self.log.info('Extracting data from Hive')
        self.log.info(self.sql)

        frame = hive_hook.get_pandas_df(self.sql, schema=self.schema)

        dynamo_hook = AwsDynamoDBHook(
            aws_conn_id=self.aws_conn_id,
            table_name=self.table_name,
            table_keys=self.table_keys,
            region_name=self.region_name,
        )

        self.log.info('Inserting rows into dynamodb')

        if self.pre_process is not None:
            # The extra arguments are passed as two keyword arguments
            # literally named ``args`` and ``kwargs``; they are not unpacked.
            items = self.pre_process(
                data=frame,
                args=self.pre_process_args,
                kwargs=self.pre_process_kwargs,
            )
        else:
            # JSON round-trip: DataFrame -> JSON text -> plain Python objects.
            records_json = frame.to_json(orient='records')
            items = json.loads(records_json)

        dynamo_hook.write_batch_data(items)

        self.log.info('Done.')
Ejemplo n.º 2
0
def persist_data(**kwargs):
    """Build one DynamoDB item from the DAG run conf and XComs and write it.

    Reads ``kwargs['ti']`` (used for ``xcom_pull``) and
    ``kwargs['dag_run']`` (used for its ``conf`` mapping).  The item combines
    ``userId``, ``s3Bucket`` and ``s3Key`` from the conf with ``FaceId`` from
    the ``FaceIndexDetails`` XCom and ``thumbnail`` from the
    ``ThumbnailDetails`` XCom, and is written through
    ``AwsDynamoDBHook.write_batch_data`` as a one-element batch.
    """
    # "TABLE_NAME" is a placeholder for the destination table.
    dynamo_hook = AwsDynamoDBHook(
        table_name="TABLE_NAME",
        aws_conn_id='aws_default',
    )

    task_instance = kwargs['ti']
    face_index_details = task_instance.xcom_pull(key='FaceIndexDetails')
    thumbnail_details = task_instance.xcom_pull(key='ThumbnailDetails')
    run_conf = kwargs['dag_run'].conf

    record = {
        "UserId": run_conf["userId"],
        "s3Bucket": run_conf["s3Bucket"],
        "s3Key": run_conf["s3Key"],
        "faceId": face_index_details['FaceId'],
        "thumbnail": thumbnail_details['thumbnail'],
    }

    dynamo_hook.write_batch_data([record])
Ejemplo n.º 3
0
    def test_insert_batch_items_dynamodb_table(self):
        """Batch-write ten items through the hook and check the item count.

        Creates the ``test_airflow`` table (string hash key ``id``), waits
        for it to exist, writes ten items with
        ``AwsDynamoDBHook.write_batch_data`` and asserts that the table
        reports ten items.
        """
        hook = AwsDynamoDBHook(aws_conn_id='aws_default',
                               table_name='test_airflow',
                               table_keys=['id'],
                               region_name='us-east-1')

        # this table needs to be created in production
        # The return value is intentionally not kept; a separate handle is
        # obtained below.
        hook.get_conn().create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[{
                'AttributeName': 'id',
                'AttributeType': 'S'
            }],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            },
        )

        table = hook.get_conn().Table('test_airflow')

        # Wait for the table *before* writing: a waiter placed after the
        # write cannot protect the write from hitting a missing table.
        table.meta.client.get_waiter('table_exists').wait(
            TableName='test_airflow')

        items = [{
            'id': str(uuid.uuid4()),
            'name': 'airflow'
        } for _ in range(10)]

        hook.write_batch_data(items)

        self.assertEqual(table.item_count, 10)