def test_hive_table_definition_step(self):
    ds = Dataset(data=DatasetData(
        name='weblogs_v01',
        table_name='weblogs',
        location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        ),
        columns=[
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ],
        compression='GZIP',
        partitions=[
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ],
    ))
    call('mkdir -p /tmp/dart-emr-test/hive/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    shutil.copyfile(this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql',
                    '/tmp/dart-emr-test/hive/copy_to_table.hql')
    hive_copy_to_table(ds, 'weblogs_stage', ds, 'weblogs', 's3://test', '/tmp/dart-emr-test/',
                       'actionid123', 1, 1)
    with open(os.path.join(this_path, 'copy_to_table_weblogs.hql')) as f:
        expected_contents = f.read()
    with open('/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql') as f:
        actual_contents = f.read()
    self.assertEqual(expected_contents, actual_contents)

def test_impala_table_definition_step(self):
    ds = Dataset(data=DatasetData(
        name='weblogs_v01',
        table_name='weblogs_parquet',
        location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
        data_format=DataFormat(
            file_format=FileFormat.PARQUET,
            row_format=RowFormat.NONE,
        ),
        columns=[
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ],
        compression='GZIP',
        partitions=[
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ],
    ))
    call('mkdir -p /tmp/dart-emr-test/impala/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    shutil.copyfile(this_path + '/../../../engine/emr/steps/impala/copy_to_table.sql',
                    '/tmp/dart-emr-test/impala/copy_to_table.sql')
    impala_copy_to_table(ds, 'weblogs_stage', ds, 'weblogs_parquet', 's3://test', '/tmp/dart-emr-test/',
                         'actionid123', 1, 1)
    with open(os.path.join(this_path, 'copy_to_table_weblogs_parquet.sql')) as f:
        expected_contents = f.read()
    with open('/tmp/dart-emr-test/impala/copy_to_table_weblogs_parquet.sql') as f:
        actual_contents = f.read()
    self.assertEqual(expected_contents, actual_contents)

def set_dart_environment_variables(ecs_agent_data_path, container_id=None):
    ecs_data_path = ecs_agent_data_path or ''
    if not os.path.isfile(ecs_data_path):
        # not running under ECS (e.g. local development): fall back to placeholder values
        os.environ['DART_INSTANCE_ID'] = 'local-instance'
        os.environ['DART_CONTAINER_ID'] = 'local-container'
        os.environ['DART_ECS_CLUSTER'] = 'local-cluster'
        os.environ['DART_ECS_CONTAINER_INSTANCE_ARN'] = 'local-containerinstancearn'
        os.environ['DART_ECS_FAMILY'] = 'local-family'
        os.environ['DART_ECS_TASK_ARN'] = 'local-task'
        return

    # derive this process's docker container id by stripping the cgroup prefix
    cmd = """ cat /proc/self/cgroup | grep "cpu:/" | sed 's/\([0-9]\):cpu:\/docker\///g' """
    container_id = container_id if container_id else call(cmd).strip()

    # ECS sometimes takes a bit to write the state to the ecs_agent_data file, so we will pause for a moment
    time.sleep(10)
    with open(ecs_data_path) as f:
        data = json.load(f)['Data']

    task_arn = data['TaskEngine']['IdToTask'][container_id]
    os.environ['DART_ECS_TASK_ARN'] = task_arn
    os.environ['DART_INSTANCE_ID'] = data['EC2InstanceID']
    os.environ['DART_CONTAINER_ID'] = container_id
    os.environ['DART_ECS_CLUSTER'] = data['Cluster']
    os.environ['DART_ECS_CONTAINER_INSTANCE_ARN'] = data['ContainerInstanceArn']
    for task in data['TaskEngine']['Tasks']:
        if task['Arn'] == task_arn:
            os.environ['DART_ECS_FAMILY'] = task['Family']

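# Example usage (a hedged sketch): when the state file is absent, the local-*
# fallbacks above apply; on an ECS instance the agent typically checkpoints its
# state to /var/lib/ecs/data/ecs_agent_data.json (that path is an assumption
# here, not something these snippets pin down).
set_dart_environment_variables('/var/lib/ecs/data/ecs_agent_data.json')
print(os.environ['DART_ECS_CLUSTER'])
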
def create_cluster(bootstrap_actions_args, cluster_name, datastore, emr_engine, instance_groups_args,
                   steps=None, auto_terminate=False):
    keyname = emr_engine.ec2_keyname
    instance_profile = emr_engine.instance_profile
    az = emr_engine.cluster_availability_zone
    cmd = 'aws emr create-cluster' \
          ' --release-label {release_label}' \
          ' --instance-type {instance_type}' \
          ' --instance-count {instance_count}' \
          ' --name {cluster_name}' \
          ' --log-uri {log_uri}' \
          ' --service-role {service_role}' \
          ' --configurations {configurations}' \
          ' --ec2-attributes {ec2_attributes}' \
          ' --enable-debugging' \
          ' --tags {tags}' \
          ' --bootstrap-actions {bootstrap_actions}' \
          ' --applications {applications}' \
          ' {steps}' \
          ' {auto_terminate}'
    cmd = cmd.format(
        release_label=datastore.data.args['release_label'],
        instance_type=datastore.data.args['instance_type'],
        instance_count=instance_groups_args[1][0] + 1,  # + 1 presumably accounts for the master node
        cluster_name=cluster_name,
        log_uri=datastore.data.s3_logs_path,
        service_role=emr_engine.service_role,
        configurations='file://%s/start_configs.json' % os.path.dirname(os.path.abspath(__file__)),
        ec2_attributes='KeyName=%s,AvailabilityZone=%s,InstanceProfile=%s' % (keyname, az, instance_profile),
        tags=' '.join(['%s=%s' % (k, v) for k, v in emr_engine.cluster_tags.iteritems()]),
        bootstrap_actions=' '.join([
            'Path="{path}",Name="{name}",Args=[{args}]'.format(
                name=a[0],
                path=a[1],
                args='' if len(a[2:]) <= 0 else '"%s"' % (','.join(a[2:])))
            for a in bootstrap_actions_args
        ]),
        applications='Name=Hadoop Name=Hive Name=Spark',
        steps='--steps ' + ' '.join([
            'Name="{name}",Args=[{args}],Jar="{jar}",ActionOnFailure="{aof}",Type="CUSTOM_JAR"'.format(
                name='step-%s' % step_num,
                args='"%s"' % (','.join(s.args())) if s.args() else '',
                jar=s.jar(),
                aof=s.action_on_failure)
            for step_num, s in enumerate(steps)
        ]) if steps else '',
        auto_terminate='--auto-terminate' if auto_terminate else '')
    result = call(cmd)
    return json.loads(result)['ClusterId']

def create_cluster(bootstrap_actions_args, cluster_name, datastore, emr_engine, instance_groups_args,
                   steps=None, auto_terminate=False, configuration_overrides=None):
    # a keyname configured on the datastore takes precedence over the engine default
    keyname = datastore.data.args['ec2_keyname'] if datastore.data.args.get('ec2_keyname') else emr_engine.ec2_keyname
    instance_profile = emr_engine.instance_profile
    subnet_id = emr_engine.subnet_id
    cmd = 'aws emr create-cluster' \
          ' --release-label {release_label}' \
          ' --instance-type {instance_type}' \
          ' --instance-count {instance_count}' \
          ' --name {cluster_name}' \
          ' --log-uri {log_uri}' \
          ' --service-role {service_role}' \
          ' --configurations {configurations}' \
          ' --ec2-attributes {ec2_attributes}' \
          ' --enable-debugging' \
          ' --tags {tags}' \
          ' --bootstrap-actions {bootstrap_actions}' \
          ' --applications {applications}' \
          ' {steps}' \
          ' {auto_terminate}'
    cmd = cmd.format(
        release_label=datastore.data.args['release_label'],
        instance_type=datastore.data.args['instance_type'],
        instance_count=instance_groups_args[1][0] + 1,  # + 1 presumably accounts for the master node
        cluster_name=cluster_name,
        log_uri=datastore.data.s3_logs_path,
        service_role=emr_engine.service_role,
        configurations=prepare_cluster_configurations(configuration_overrides),
        ec2_attributes='KeyName=%s,InstanceProfile=%s,SubnetId=%s' % (keyname, instance_profile, subnet_id),
        tags=' '.join(['%s=%s' % (k, v) for k, v in emr_engine.cluster_tags.iteritems()]),
        bootstrap_actions=' '.join([
            'Path="{path}",Name="{name}",Args=[{args}]'.format(
                name=a[0],
                path=a[1],
                args='' if len(a[2:]) <= 0 else '"%s"' % (','.join(a[2:])))
            for a in bootstrap_actions_args
        ]),
        applications='Name=Hadoop Name=Hive Name=Spark',
        steps='--steps ' + ' '.join([
            'Name="{name}",Args=[{args}],Jar="{jar}",ActionOnFailure="{aof}",Type="CUSTOM_JAR"'.format(
                name='step-%s' % step_num,
                args='"%s"' % (','.join(s.args())) if s.args() else '',
                jar=s.jar(),
                aof=s.action_on_failure)
            for step_num, s in enumerate(steps)
        ]) if steps else '',
        auto_terminate='--auto-terminate' if auto_terminate else '')
    result = call(cmd)
    return json.loads(result)['ClusterId']

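# prepare_cluster_configurations is called above but not defined in these
# snippets. Below is a minimal sketch of one plausible implementation,
# assuming it merges optional per-datastore overrides into the bundled
# start_configs.json (matching entries by EMR's "Classification" key) and
# returns a file:// URI suitable for `aws emr create-cluster --configurations`.
# The name, signature, and merge semantics are assumptions, not the project's
# confirmed code.
import json
import os
import tempfile


def prepare_cluster_configurations(configuration_overrides=None):
    base_path = '%s/start_configs.json' % os.path.dirname(os.path.abspath(__file__))
    if not configuration_overrides:
        return 'file://' + base_path
    with open(base_path) as f:
        configurations = json.load(f)
    # naive merge: an override with a matching Classification replaces the base entry
    by_classification = dict((c['Classification'], c) for c in configurations)
    for override in configuration_overrides:
        by_classification[override['Classification']] = override
    fd, merged_path = tempfile.mkstemp(suffix='.json')
    with os.fdopen(fd, 'w') as f:
        json.dump(list(by_classification.values()), f)
    return 'file://' + merged_path
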
def _handle_docker_concerns(self, cwl_image, eng_cfg, misc_log_group_name, output_config, syslog_log_group_name):
    if 'docker' in self.stacks_to_skip:
        _logger.info('skipping docker concerns')
        return

    _logger.info('configuring and building cloudwatch logs docker image (a special snowflake)')
    dart_root = dart_root_relative_path()
    r_id = random_id()
    values = (dart_root, r_id)
    call('cd %s && cd .. && git clone https://github.com/awslabs/ecs-cloudwatch-logs dart-cwl-%s' % values)
    docker_init = dart_root_relative_path('tools', 'docker', 'docker-local-init.sh')
    with open(dart_root_relative_path('aws', 'cloudwatch-logs', 'awslogs_template.conf')) as cwl_conf_template, \
            open(dart_root_relative_path('..', 'dart-cwl-%s/awslogs.conf' % r_id), mode='w') as cwl_conf:
        contents = cwl_conf_template.read()
        contents = contents.replace('{DART_LOG_GROUP_SYSLOG}', syslog_log_group_name)
        contents = contents.replace('{DART_LOG_GROUP_MISC}', misc_log_group_name)
        cwl_conf.write(contents)
    cwl_root = dart_root_relative_path('..', 'dart-cwl-%s' % r_id)
    call('source %s && cd %s && docker build -f Dockerfile -t %s .' % (docker_init, cwl_root, cwl_image))

    _logger.info('running grunt build')
    call('cd %s && grunt build' % dart_root_relative_path('src', 'python', 'dart', 'web', 'ui'))

    _logger.info('building other docker images')
    for repo_name in [rn for rn in output_config['ecr']['repo_names'] if not rn.endswith('cloudwatchlogs')]:
        version = eng_cfg['emr_engine']['options']['impala_version'] if 'impala' in repo_name else '1.0.0'
        docker_img = self._docker_image(repo_name, output_config, version=version)
        docker_file_suffix = repo_name.split('/')[-1]
        values = (docker_init, dart_root, docker_file_suffix, docker_img)
        call('source %s && cd %s && docker build -f tools/docker/Dockerfile-%s -t %s .' % values)

    _logger.info('pushing docker images')
    cmd = ('source %s && cd %s && $(aws ecr get-login)' % (docker_init, dart_root)) + ' && docker push %s'
    for repo_name in output_config['ecr']['repo_names']:
        version = eng_cfg['emr_engine']['options']['impala_version'] if 'impala' in repo_name else '1.0.0'
        call(cmd % self._docker_image(repo_name, output_config, version=version))

def create_cluster(bootstrap_actions_args, cluster_name, datastore, emr_engine, instance_groups_args):
    keyname = emr_engine.ec2_keyname
    instance_profile = emr_engine.instance_profile
    az = emr_engine.cluster_availability_zone
    cmd = 'aws emr create-cluster' \
          ' --release-label {release_label}' \
          ' --instance-type {instance_type}' \
          ' --instance-count {instance_count}' \
          ' --name {cluster_name}' \
          ' --log-uri {log_uri}' \
          ' --service-role {service_role}' \
          ' --configurations {configurations}' \
          ' --ec2-attributes {ec2_attributes}' \
          ' --enable-debugging' \
          ' --tags {tags}' \
          ' --bootstrap-actions {bootstrap_actions}' \
          ' --applications {applications}'
    cmd = cmd.format(
        release_label=datastore.data.args['release_label'],
        instance_type=datastore.data.args['instance_type'],
        instance_count=instance_groups_args[1][0] + 1,  # + 1 presumably accounts for the master node
        cluster_name=cluster_name,
        log_uri=datastore.data.s3_logs_path,
        service_role=emr_engine.service_role,
        configurations='file://%s/start_configs.json' % os.path.dirname(os.path.abspath(__file__)),
        ec2_attributes='KeyName=%s,AvailabilityZone=%s,InstanceProfile=%s' % (keyname, az, instance_profile),
        tags=' '.join(['%s=%s' % (k, v) for k, v in emr_engine.cluster_tags.iteritems()]),
        bootstrap_actions=' '.join([
            'Path="{path}",Name="{name}",Args=[{args}]'.format(
                name=a[0],
                path=a[1],
                args='' if len(a[2:]) <= 0 else '"%s"' % (','.join(a[2:])))
            for a in bootstrap_actions_args
        ]),
        applications='Name=Hadoop Name=Hive Name=Spark',
    )
    result = call(cmd)
    return json.loads(result)['ClusterId']

def s3_copy_recursive(source_path, dest_path):
    call('aws s3 cp --recursive %s %s' % (source_path, dest_path))

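# The `call` helper used throughout these snippets is not shown. A minimal
# sketch, assuming it shells out through bash (the docker commands above rely
# on `source`, a bashism), raises on a non-zero exit status, and returns the
# command's stdout, which create_cluster then parses as JSON:
import subprocess


def call(cmd):
    return subprocess.check_output(cmd, shell=True, executable='/bin/bash')
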
def test_hive_table_definition_step(self):
    ds = Dataset(data=DatasetData(
        name='weblogs_v01',
        table_name='weblogs',
        location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        ),
        columns=[
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ],
        compression='GZIP',
        partitions=[
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ],
    ))
    call('mkdir -p /tmp/dart-emr-test/hive/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    shutil.copyfile(this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql',
                    '/tmp/dart-emr-test/hive/copy_to_table.hql')
    hive_copy_to_table(ds, 'weblogs_stage', ds, 'weblogs', 's3://test', '/tmp/dart-emr-test/',
                       'actionid123', None, 1, 1)
    with open(os.path.join(this_path, 'copy_to_table_weblogs.hql')) as f:
        expected_contents = f.read()
    with open('/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql') as f:
        actual_contents = f.read()
    self.assertEqual(expected_contents, actual_contents)

def test_hive_table_definition_step(self):
    ds = Dataset(data=DatasetData(
        name='owen_eu_v01',
        table_name='owen_eu',
        location='s3://s3-rpt-uss-dat-warehouse/prd/inbound/overlord/eu-all-events',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.JSON,
        ),
        columns=[
            Column('host', 'STRING', path='metadata.host'),
            Column('pageName', 'STRING', path='owen.context.pageName'),
            Column('viewInstanceUuid', 'STRING', path='owen.context.viewInstanceUuid'),
            Column('previousPageName', 'STRING', path='owen.context.previousPageName'),
            Column('previousViewInstanceUuid', 'STRING', path='owen.context.previousViewInstanceUuid'),
            Column('session', 'STRING', path='owen.context.session'),
            Column('pageType', 'STRING', path='owen.context.pageType'),
            Column('propertyName', 'STRING', path='owen.context.propertyName'),
            Column('enviroment', 'STRING', path='owen.context.environment'),
            Column('appForegroundFlag', 'BOOLEAN', path='owen.context.appForegroundFlag'),
            Column('bluetoothEnabledFlag', 'BOOLEAN', path='owen.context.bluetoothEnabledFlag'),
            Column('favoriteFlag', 'BOOLEAN', path='owen.context.favoriteFlag'),
            Column('locationEnabledFlag', 'BOOLEAN', path='owen.context.locationEnabledFlag'),
            Column('loggedInFlag', 'BOOLEAN', path='owen.context.loggedInFlag'),
            Column('notificationEnabledFlag', 'BOOLEAN', path='owen.context.notificationEnabledFlag'),
            Column('personalizationFlag', 'BOOLEAN', path='owen.context.personalizationFlag'),
            Column('advertiserUuid', 'STRING', path='owen.context.advertiserUuid'),
            Column('udid', 'STRING', path='owen.context.udid'),
            Column('userQualifier', 'STRING', path='owen.context.userQualifier'),
            Column('userId', 'STRING', path='owen.context.custom.legacy.userId'),
            Column('userUuid', 'STRING', path='owen.context.userUuid'),
            Column('macAddress', 'STRING', path='owen.context.macAddress'),
            Column('ipAddress', 'STRING', path='owen.context.ipAddress'),
            Column('osVersion', 'STRING', path='owen.context.osVersion'),
            Column('osFamily', 'STRING', path='owen.context.osFamily'),
            Column('osName', 'STRING', path='owen.context.osName'),
            Column('browserFamily', 'STRING', path='owen.context.browserFamily'),
            Column('deviceCategory', 'STRING', path='owen.context.deviceCategory'),
            Column('deviceMake', 'STRING', path='owen.context.mobileDeviceMake'),
            Column('deviceModel', 'STRING', path='owen.context.mobileDeviceModel'),
            Column('connectionType', 'STRING', path='owen.context.connectionType'),
            Column('userAgent', 'STRING', path='owen.context.userAgent'),
            Column('geofenceId', 'STRING', path='owen.context.custom.legacy.geofenceId'),
            Column('eventTimestamp', 'TIMESTAMP', path='owen.event.eventTimestamp', date_pattern="yyyy-MM-dd'T'HH:mm:ssZ"),
            Column('eventInstanceUuid', 'STRING', path='owen.event.eventInstanceUuid'),
            Column('eventPlatformVersion', 'STRING', path='owen.event.eventPlatformVersion'),
            Column('eventVersion', 'STRING', path='owen.event.eventVersion'),
            Column('eventCategory', 'STRING', path='owen.event.eventCategory'),
            Column('eventName', 'STRING', path='owen.event.eventName'),
            Column('eventAction', 'STRING', path='owen.event.eventAction'),
            Column('eventPlatform', 'STRING', path='owen.event.eventPlatform'),
            Column('testUnixTimestampSecondsPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampSecondsPattern', date_pattern='UNIX_TIMESTAMP_SECONDS'),
            Column('testUnixTimestampMillisPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampMillisPattern', date_pattern='UNIX_TIMESTAMP_MILLIS'),
        ],
        compression='GZIP',
        partitions=[
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ],
    ))
    call('mkdir -p /tmp/dart-emr-test/hive/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    shutil.copyfile(this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql',
                    '/tmp/dart-emr-test/hive/copy_to_table.hql')
    action_id = 'actionid123'
    target_dataset = Dataset.from_dict(ds.to_dict())
    target_dataset.data.data_format.num_header_rows = 0
    target_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE)
    stage_dataset = Dataset.from_dict(ds.to_dict())
    assert isinstance(stage_dataset, Dataset)
    for c in stage_dataset.data.columns:
        c.data_type = DataType.STRING
    hive_copy_to_table(stage_dataset, 'owen_eu_stage', target_dataset, 'owen_eu', 's3://test',
                       '/tmp/dart-emr-test/', action_id, None, 1, 1)
    with open(os.path.join(this_path, 'copy_to_table_owen_eu.hql')) as f:
        expected_contents = f.read()
    with open('/tmp/dart-emr-test/hive/copy_to_table_owen_eu.hql') as f:
        actual_contents = f.read()
    self.assertEqual(expected_contents, actual_contents)