def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

    self.log.info('Extracting data from Hive')
    self.log.info(self.sql)

    data = hive.get_pandas_df(self.sql, schema=self.schema)
    dynamodb = AwsDynamoDBHook(
        aws_conn_id=self.aws_conn_id,
        table_name=self.table_name,
        table_keys=self.table_keys,
        region_name=self.region_name,
    )

    self.log.info('Inserting rows into dynamodb')

    if self.pre_process is None:
        dynamodb.write_batch_data(
            json.loads(data.to_json(orient='records')))
    else:
        dynamodb.write_batch_data(
            self.pre_process(data=data,
                             args=self.pre_process_args,
                             kwargs=self.pre_process_kwargs))

    self.log.info('Done.')
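When pre_process is set, the operator hands the extracted pandas DataFrame to that callable and writes whatever it returns straight to DynamoDB, so the callable must return a list of item dicts. A minimal sketch of such a callable is below; the name filter_and_serialize and the 'id' column it filters on are illustrative, not part of the operator.

import json


def filter_and_serialize(data, args=None, kwargs=None):
    # Example pre_process callable (hypothetical): drop rows whose key
    # column is null, then serialize to the list-of-dicts shape that
    # AwsDynamoDBHook.write_batch_data expects.
    cleaned = data.dropna(subset=['id'])
    return json.loads(cleaned.to_json(orient='records'))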
def persist_data(**kwargs):
    hook = AwsDynamoDBHook(table_name="TABLE_NAME",  # TABLE_NAME
                           aws_conn_id='aws_default')

    faceIndexDetails = kwargs['ti'].xcom_pull(key='FaceIndexDetails')
    thumbnailDetails = kwargs['ti'].xcom_pull(key='ThumbnailDetails')
    conf = kwargs['dag_run'].conf

    dynamoItem = {
        "UserId": conf["userId"],
        "s3Bucket": conf["s3Bucket"],
        "s3Key": conf["s3Key"],
        "faceId": faceIndexDetails['FaceId'],
        "thumbnail": thumbnailDetails['thumbnail']
    }
    items = [dynamoItem]
    hook.write_batch_data(items)
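persist_data pulls from XCom ('ti') and from dag_run.conf, so it is meant to run as a PythonOperator task inside a DAG that is triggered with a conf payload. A minimal wiring sketch under that assumption follows; the DAG id, schedule, and task id are illustrative, and provide_context=True is the Airflow 1.x way of getting ti and dag_run into kwargs.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator  # Airflow 1.x import path

with DAG(dag_id='face_index_pipeline',      # illustrative DAG id
         start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:    # run only when triggered with a conf payload
    persist = PythonOperator(
        task_id='persist_data',
        python_callable=persist_data,
        provide_context=True,               # exposes ti and dag_run via **kwargs
    )

The DAG would then be triggered with a conf carrying userId, s3Bucket, and s3Key, for example via `airflow trigger_dag --conf '{"userId": "...", "s3Bucket": "...", "s3Key": "..."}' face_index_pipeline` on the Airflow 1.x CLI.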
def test_insert_batch_items_dynamodb_table(self):
    hook = AwsDynamoDBHook(aws_conn_id='aws_default',
                           table_name='test_airflow',
                           table_keys=['id'],
                           region_name='us-east-1')

    # this table needs to be created in production
    table = hook.get_conn().create_table(
        TableName='test_airflow',
        KeySchema=[
            {
                'AttributeName': 'id',
                'KeyType': 'HASH'
            },
        ],
        AttributeDefinitions=[{
            'AttributeName': 'id',
            'AttributeType': 'S'
        }],
        ProvisionedThroughput={
            'ReadCapacityUnits': 10,
            'WriteCapacityUnits': 10
        },
    )

    table = hook.get_conn().Table('test_airflow')

    items = [{'id': str(uuid.uuid4()), 'name': 'airflow'}
             for _ in range(10)]

    hook.write_batch_data(items)

    table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')
    self.assertEqual(table.item_count, 10)
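A test like this is normally run against a mocked DynamoDB backend rather than real AWS, so create_table and write_batch_data never leave the process. The surrounding class and decorator below are an assumed setup using moto's mock_dynamodb2 decorator (available in older moto releases), with the test body elided.

import unittest
import uuid

from moto import mock_dynamodb2  # assumed mocking library; not shown in the snippet above

from airflow.contrib.hooks.aws_dynamodb_hook import AwsDynamoDBHook  # Airflow 1.10 import path


class TestDynamoDBHook(unittest.TestCase):

    @mock_dynamodb2  # all boto3 DynamoDB calls in the test hit an in-memory mock
    def test_insert_batch_items_dynamodb_table(self):
        ...  # body as shown above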