Example #1
0
    def _handle_docker_concerns(self, cwl_image, eng_cfg, misc_log_group_name,
                                output_config, syslog_log_group_name):
        """Build the docker images of this deployment and push them.

        Only logs and returns when 'docker' is listed in
        ``self.stacks_to_skip``. Otherwise it clones the ecs-cloudwatch-logs
        repository next to the dart root, renders its awslogs.conf from the
        template, builds that image as ``cwl_image``, runs the UI grunt
        build, builds one image for every other ECR repo and finally pushes
        the image of every repo.

        :param cwl_image: tag passed to ``docker build -t`` for the cloudwatch logs image
        :param eng_cfg: mapping; ``['emr_engine']['options']['impala_version']`` is read for repos with 'impala' in their name
        :param misc_log_group_name: substituted for ``{DART_LOG_GROUP_MISC}`` in the template
        :param output_config: mapping; ``['ecr']['repo_names']`` lists the repos, also handed to ``self._docker_image``
        :param syslog_log_group_name: substituted for ``{DART_LOG_GROUP_SYSLOG}`` in the template
        """
        if 'docker' in self.stacks_to_skip:
            _logger.info('skipping docker concerns')
            return

        def image_for(repo):
            # repos with 'impala' in their name take the configured impala version
            if 'impala' in repo:
                tag = eng_cfg['emr_engine']['options']['impala_version']
            else:
                tag = '1.0.0'
            return self._docker_image(repo, output_config, version=tag)

        _logger.info(
            'configuring and building cloudwatch logs docker image (a special snowflake)'
        )
        dart_root = dart_root_relative_path()
        r_id = random_id()
        clone_cmd = 'cd %s && cd .. && git clone https://github.com/awslabs/ecs-cloudwatch-logs dart-cwl-%s'
        call(clone_cmd % (dart_root, r_id))
        docker_init = dart_root_relative_path('tools', 'docker',
                                              'docker-local-init.sh')

        # render awslogs.conf inside the fresh clone from the dart template
        clone_dir_name = 'dart-cwl-%s' % r_id
        with open(dart_root_relative_path('aws', 'cloudwatch-logs', 'awslogs_template.conf')) as template_file:
            with open(dart_root_relative_path('..', '%s/awslogs.conf' % clone_dir_name), mode='w') as conf_file:
                rendered = template_file.read()
                rendered = rendered.replace(
                    '{DART_LOG_GROUP_SYSLOG}', syslog_log_group_name
                ).replace(
                    '{DART_LOG_GROUP_MISC}', misc_log_group_name
                )
                conf_file.write(rendered)

        cwl_root = dart_root_relative_path('..', clone_dir_name)
        cwl_build_cmd = 'source %s && cd %s && docker build -f Dockerfile -t %s .'
        call(cwl_build_cmd % (docker_init, cwl_root, cwl_image))

        _logger.info('running grunt build')
        ui_dir = dart_root_relative_path('src', 'python', 'dart', 'web', 'ui')
        call('cd %s && grunt build' % ui_dir)

        _logger.info('building other docker images')
        build_cmd = 'source %s && cd %s && docker build -f tools/docker/Dockerfile-%s -t %s .'
        for repo_name in output_config['ecr']['repo_names']:
            # the cloudwatch logs image was already built from its own clone
            if repo_name.endswith('cloudwatchlogs'):
                continue
            docker_img = image_for(repo_name)
            docker_file_suffix = repo_name.split('/')[-1]
            call(build_cmd % (docker_init, dart_root, docker_file_suffix, docker_img))

        _logger.info('pushing docker images')
        login_part = 'source %s && cd %s && $(aws ecr get-login)' % (docker_init, dart_root)
        push_template = login_part + ' && docker push %s'
        for repo_name in output_config['ecr']['repo_names']:
            call(push_template % image_for(repo_name))
    def test_hive_table_definition_step(self):
        """Compare the hive copy_to_table script for the weblogs dataset with the stored expectation.

        The dataset is a GZIP TEXTFILE parsed with a regex row format. After
        ``hive_copy_to_table`` has run, the file
        /tmp/dart-emr-test/hive/copy_to_table_weblogs.hql must equal
        copy_to_table_weblogs.hql located next to this test module.
        """
        regex_format = DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            # NOTE(review): the last specifier is "%21s" (no '$'), unlike the other twenty - confirm it is intentional
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        )
        weblog_columns = [
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ]
        partition_columns = [Column(part, 'STRING') for part in ('year', 'week', 'day')]
        dataset = Dataset(data=DatasetData(
            name='weblogs_v01',
            table_name='weblogs',
            location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
            data_format=regex_format,
            columns=weblog_columns,
            compression='GZIP',
            partitions=partition_columns,
        ))

        # stage the script template where hive_copy_to_table is pointed at
        call('mkdir -p /tmp/dart-emr-test/hive/')
        test_dir = os.path.dirname(os.path.abspath(__file__))
        template_source = test_dir + '/../../../engine/emr/steps/hive/copy_to_table.hql'
        shutil.copyfile(template_source, '/tmp/dart-emr-test/hive/copy_to_table.hql')
        hive_copy_to_table(dataset, 'weblogs_stage', dataset, 'weblogs', 's3://test',
                           '/tmp/dart-emr-test/', 'actionid123', 1, 1)

        expected_path = os.path.join(test_dir, 'copy_to_table_weblogs.hql')
        with open(expected_path) as expected_file:
            expected_contents = expected_file.read()

        with open('/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql') as actual_file:
            actual_contents = actual_file.read()

        self.assertEqual(expected_contents, actual_contents)
    def test_impala_table_definition_step(self):
        """Compare the impala copy_to_table script for the parquet weblogs dataset with the stored expectation.

        After ``impala_copy_to_table`` has run, the file
        /tmp/dart-emr-test/impala/copy_to_table_weblogs_parquet.sql must equal
        copy_to_table_weblogs_parquet.sql located next to this test module.
        """
        parquet_format = DataFormat(
            file_format=FileFormat.PARQUET,
            row_format=RowFormat.NONE,
        )
        weblog_columns = [
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ]
        partition_columns = [Column(part, 'STRING') for part in ('year', 'week', 'day')]
        dataset = Dataset(data=DatasetData(
            name='weblogs_v01',
            table_name='weblogs_parquet',
            location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
            data_format=parquet_format,
            columns=weblog_columns,
            compression='GZIP',
            partitions=partition_columns,
        ))

        # stage the script template where impala_copy_to_table is pointed at
        call('mkdir -p /tmp/dart-emr-test/impala/')
        test_dir = os.path.dirname(os.path.abspath(__file__))
        template_source = test_dir + '/../../../engine/emr/steps/impala/copy_to_table.sql'
        shutil.copyfile(template_source, '/tmp/dart-emr-test/impala/copy_to_table.sql')
        impala_copy_to_table(dataset, 'weblogs_stage', dataset, 'weblogs_parquet', 's3://test',
                             '/tmp/dart-emr-test/', 'actionid123', 1, 1)

        expected_path = os.path.join(test_dir, 'copy_to_table_weblogs_parquet.sql')
        with open(expected_path) as expected_file:
            expected_contents = expected_file.read()

        with open('/tmp/dart-emr-test/impala/copy_to_table_weblogs_parquet.sql') as actual_file:
            actual_contents = actual_file.read()

        self.assertEqual(expected_contents, actual_contents)
Example #4
0
def set_dart_environment_variables(ecs_agent_data_path, container_id=None):
    """Populate the DART_* variables in ``os.environ``.

    When ``ecs_agent_data_path`` is falsy or does not name an existing file,
    fixed ``local-*`` placeholder values are set and the function returns.
    Otherwise the file is parsed as JSON and the values are read from its
    ``Data`` section. ``DART_ECS_FAMILY`` is only set when a task whose
    ``Arn`` equals the container's task ARN is listed under
    ``TaskEngine.Tasks``.

    The environment is updated only after every value has been read, so a
    missing key does not leave it half-populated.

    :param ecs_agent_data_path: path of the ECS agent data file, may be None
    :param container_id: id to look up in ``TaskEngine.IdToTask``; when
        falsy it is obtained by running a shell pipeline over
        /proc/self/cgroup through ``call``
    :raises KeyError: if the container id or an expected key is absent from the file
    """
    ecs_data_path = ecs_agent_data_path or ''
    if not os.path.isfile(ecs_data_path):
        local_defaults = (
            ('DART_INSTANCE_ID', 'local-instance'),
            ('DART_CONTAINER_ID', 'local-container'),
            ('DART_ECS_CLUSTER', 'local-cluster'),
            ('DART_ECS_CONTAINER_INSTANCE_ARN', 'local-containerinstancearn'),
            ('DART_ECS_FAMILY', 'local-family'),
            ('DART_ECS_TASK_ARN', 'local-task'),
        )
        for name, value in local_defaults:
            os.environ[name] = value
        return

    if not container_id:
        # raw string: the sed expression contains backslashes that are not Python escapes
        cmd = r""" cat /proc/self/cgroup | grep "cpu:/" | sed 's/\([0-9]\):cpu:\/docker\///g' """
        container_id = call(cmd).strip()

    # ECS sometimes takes a bit to write the state to the ecs_agent_data file, so we will pause for a moment
    time.sleep(10)

    with open(ecs_data_path) as f:
        data = json.load(f)['Data']

    # collect everything first; any KeyError is raised before os.environ is touched
    task_arn = data['TaskEngine']['IdToTask'][container_id]
    values = {
        'DART_ECS_TASK_ARN': task_arn,
        'DART_INSTANCE_ID': data['EC2InstanceID'],
        'DART_CONTAINER_ID': container_id,
        'DART_ECS_CLUSTER': data['Cluster'],
        'DART_ECS_CONTAINER_INSTANCE_ARN': data['ContainerInstanceArn'],
    }
    for task in data['TaskEngine']['Tasks']:
        if task['Arn'] == task_arn:
            values['DART_ECS_FAMILY'] = task['Family']

    for name, value in values.items():
        os.environ[name] = value
Example #5
0
def create_cluster(bootstrap_actions_args,
                   cluster_name,
                   datastore,
                   emr_engine,
                   instance_groups_args,
                   steps=None,
                   auto_terminate=False):
    """Start an EMR cluster through the ``aws emr create-cluster`` CLI.

    The command line is assembled from the datastore args, the engine
    settings, the bootstrap actions and the optional steps, then handed to
    ``call``; its output is parsed as JSON.

    :param bootstrap_actions_args: sequences shaped (name, path, *args), one per bootstrap action
    :param cluster_name: value for ``--name``
    :param datastore: supplies ``data.args['release_label']``, ``data.args['instance_type']`` and ``data.s3_logs_path``
    :param emr_engine: supplies key name, instance profile, availability zone, service role and cluster tags
    :param instance_groups_args: ``instance_groups_args[1][0] + 1`` becomes the instance count
    :param steps: optional sequence of step objects exposing ``args()``, ``jar()`` and ``action_on_failure``
    :param auto_terminate: when truthy, ``--auto-terminate`` is appended
    :return: the ``ClusterId`` entry of the parsed CLI output
    """
    keyname = emr_engine.ec2_keyname
    instance_profile = emr_engine.instance_profile
    az = emr_engine.cluster_availability_zone

    def bootstrap_spec(action):
        # action is (name, path, arg, arg, ...); the args become one quoted, comma separated value
        return 'Path="{path}",Name="{name}",Args=[{args}]'.format(
            name=action[0],
            path=action[1],
            args='"%s"' % (','.join(action[2:])) if len(action[2:]) > 0 else '')

    def step_spec(step_num, step):
        return 'Name="{name}",Args=[{args}],Jar="{jar}",ActionOnFailure="{aof}",Type="CUSTOM_JAR"'.format(
            name='step-%s' % step_num,
            args='"%s"' % (','.join(step.args())) if step.args() else '',
            jar=step.jar(),
            aof=step.action_on_failure)

    template = ' '.join((
        'aws emr create-cluster',
        '--release-label {release_label}',
        '--instance-type {instance_type}',
        '--instance-count {instance_count}',
        '--name {cluster_name}',
        '--log-uri {log_uri}',
        '--service-role {service_role}',
        '--configurations {configurations}',
        '--ec2-attributes {ec2_attributes}',
        '--enable-debugging',
        '--tags {tags}',
        '--bootstrap-actions {bootstrap_actions}',
        '--applications {applications}',
        '{steps}',
        '{auto_terminate}',
    ))

    release_label = datastore.data.args['release_label']
    instance_type = datastore.data.args['instance_type']
    instance_count = instance_groups_args[1][0] + 1
    log_uri = datastore.data.s3_logs_path
    service_role = emr_engine.service_role
    # start_configs.json is looked up in the directory of this module
    configurations = 'file://%s/start_configs.json' % os.path.dirname(os.path.abspath(__file__))
    ec2_attributes = 'KeyName=%s,AvailabilityZone=%s,InstanceProfile=%s' % (keyname, az, instance_profile)
    tags = ' '.join('%s=%s' % (k, v) for k, v in emr_engine.cluster_tags.iteritems())
    bootstrap_actions = ' '.join(bootstrap_spec(a) for a in bootstrap_actions_args)
    if steps:
        steps_option = '--steps ' + ' '.join(step_spec(i, s) for i, s in enumerate(steps))
    else:
        steps_option = ''
    if auto_terminate:
        auto_terminate_option = '--auto-terminate'
    else:
        auto_terminate_option = ''

    cmd = template.format(
        release_label=release_label,
        instance_type=instance_type,
        instance_count=instance_count,
        cluster_name=cluster_name,
        log_uri=log_uri,
        service_role=service_role,
        configurations=configurations,
        ec2_attributes=ec2_attributes,
        tags=tags,
        bootstrap_actions=bootstrap_actions,
        applications='Name=Hadoop Name=Hive Name=Spark',
        steps=steps_option,
        auto_terminate=auto_terminate_option)
    response = json.loads(call(cmd))
    return response['ClusterId']
def create_cluster(bootstrap_actions_args, cluster_name, datastore, emr_engine, instance_groups_args,
                   steps=None, auto_terminate=False, configuration_overrides=None):
    """Start an EMR cluster in the engine's subnet through the ``aws emr create-cluster`` CLI.

    The command line is assembled from the datastore args, the engine
    settings, the bootstrap actions and the optional steps, then handed to
    ``call``; its output is parsed as JSON.

    :param bootstrap_actions_args: sequences shaped (name, path, *args), one per bootstrap action
    :param cluster_name: value for ``--name``
    :param datastore: supplies ``data.args`` (release label, instance type, optional ``ec2_keyname``) and ``data.s3_logs_path``
    :param emr_engine: supplies the fallback key name, instance profile, subnet id, service role and cluster tags
    :param instance_groups_args: ``instance_groups_args[1][0] + 1`` becomes the instance count
    :param steps: optional sequence of step objects exposing ``args()``, ``jar()`` and ``action_on_failure``
    :param auto_terminate: when truthy, ``--auto-terminate`` is appended
    :param configuration_overrides: passed to ``prepare_cluster_configurations`` to obtain the ``--configurations`` value
    :return: the ``ClusterId`` entry of the parsed CLI output
    """
    # a non-empty key name on the datastore wins over the engine-wide one
    keyname = datastore.data.args.get('ec2_keyname') or emr_engine.ec2_keyname
    instance_profile = emr_engine.instance_profile
    subnet_id = emr_engine.subnet_id

    def bootstrap_spec(action):
        # action is (name, path, arg, arg, ...); the args become one quoted, comma separated value
        return 'Path="{path}",Name="{name}",Args=[{args}]'.format(
            name=action[0],
            path=action[1],
            args='"%s"' % (','.join(action[2:])) if len(action[2:]) > 0 else '')

    def step_spec(step_num, step):
        return 'Name="{name}",Args=[{args}],Jar="{jar}",ActionOnFailure="{aof}",Type="CUSTOM_JAR"'.format(
            name='step-%s' % step_num,
            args='"%s"' % (','.join(step.args())) if step.args() else '',
            jar=step.jar(),
            aof=step.action_on_failure)

    template = ' '.join((
        'aws emr create-cluster',
        '--release-label {release_label}',
        '--instance-type {instance_type}',
        '--instance-count {instance_count}',
        '--name {cluster_name}',
        '--log-uri {log_uri}',
        '--service-role {service_role}',
        '--configurations {configurations}',
        '--ec2-attributes {ec2_attributes}',
        '--enable-debugging',
        '--tags {tags}',
        '--bootstrap-actions {bootstrap_actions}',
        '--applications {applications}',
        '{steps}',
        '{auto_terminate}',
    ))

    release_label = datastore.data.args['release_label']
    instance_type = datastore.data.args['instance_type']
    instance_count = instance_groups_args[1][0] + 1
    log_uri = datastore.data.s3_logs_path
    service_role = emr_engine.service_role
    configurations = prepare_cluster_configurations(configuration_overrides)
    ec2_attributes = 'KeyName=%s,InstanceProfile=%s,SubnetId=%s' % (keyname, instance_profile, subnet_id)
    tags = ' '.join('%s=%s' % (k, v) for k, v in emr_engine.cluster_tags.iteritems())
    bootstrap_actions = ' '.join(bootstrap_spec(a) for a in bootstrap_actions_args)
    if steps:
        steps_option = '--steps ' + ' '.join(step_spec(i, s) for i, s in enumerate(steps))
    else:
        steps_option = ''
    if auto_terminate:
        auto_terminate_option = '--auto-terminate'
    else:
        auto_terminate_option = ''

    cmd = template.format(
        release_label=release_label,
        instance_type=instance_type,
        instance_count=instance_count,
        cluster_name=cluster_name,
        log_uri=log_uri,
        service_role=service_role,
        configurations=configurations,
        ec2_attributes=ec2_attributes,
        tags=tags,
        bootstrap_actions=bootstrap_actions,
        applications='Name=Hadoop Name=Hive Name=Spark',
        steps=steps_option,
        auto_terminate=auto_terminate_option,
    )
    response = json.loads(call(cmd))
    return response['ClusterId']
    def _handle_docker_concerns(self, cwl_image, eng_cfg, misc_log_group_name, output_config, syslog_log_group_name):
        """Build and push the docker images of this deployment.

        Only logs and returns when 'docker' is in ``self.stacks_to_skip``.
        Otherwise: clone ecs-cloudwatch-logs beside the dart root, write its
        awslogs.conf from the template with both log group names filled in,
        build it as ``cwl_image``, run the UI grunt build, build an image for
        each remaining ECR repo and push the image of every repo.

        :param cwl_image: tag passed to ``docker build -t`` for the cloudwatch logs image
        :param eng_cfg: mapping; ``['emr_engine']['options']['impala_version']`` is read for repos with 'impala' in their name
        :param misc_log_group_name: substituted for ``{DART_LOG_GROUP_MISC}`` in the template
        :param output_config: mapping; ``['ecr']['repo_names']`` lists the repos, also handed to ``self._docker_image``
        :param syslog_log_group_name: substituted for ``{DART_LOG_GROUP_SYSLOG}`` in the template
        """
        if 'docker' in self.stacks_to_skip:
            _logger.info('skipping docker concerns')
            return

        def version_of(repo):
            # every image is tagged 1.0.0 except impala ones, which follow the configured impala version
            return eng_cfg['emr_engine']['options']['impala_version'] if 'impala' in repo else '1.0.0'

        _logger.info('configuring and building cloudwatch logs docker image (a special snowflake)')
        dart_root = dart_root_relative_path()
        r_id = random_id()
        call('cd %s && cd .. && git clone https://github.com/awslabs/ecs-cloudwatch-logs dart-cwl-%s' % (dart_root, r_id))
        docker_init = dart_root_relative_path('tools', 'docker', 'docker-local-init.sh')

        substitutions = (
            ('{DART_LOG_GROUP_SYSLOG}', syslog_log_group_name),
            ('{DART_LOG_GROUP_MISC}', misc_log_group_name),
        )
        with open(dart_root_relative_path('aws', 'cloudwatch-logs', 'awslogs_template.conf')) as template_in, \
                open(dart_root_relative_path('..', 'dart-cwl-%s/awslogs.conf' % r_id), mode='w') as conf_out:
            text = template_in.read()
            for placeholder, group_name in substitutions:
                text = text.replace(placeholder, group_name)
            conf_out.write(text)

        cwl_root = dart_root_relative_path('..', 'dart-cwl-%s' % r_id)
        cwl_build_args = (docker_init, cwl_root, cwl_image)
        call('source %s && cd %s && docker build -f Dockerfile -t %s .' % cwl_build_args)

        _logger.info('running grunt build')
        ui_root = dart_root_relative_path('src', 'python', 'dart', 'web', 'ui')
        call('cd %s && grunt build' % ui_root)

        _logger.info('building other docker images')
        # the cloudwatch logs repo is skipped here: its image was built above from the clone
        other_repos = [rn for rn in output_config['ecr']['repo_names'] if not rn.endswith('cloudwatchlogs')]
        for repo_name in other_repos:
            docker_img = self._docker_image(repo_name, output_config, version=version_of(repo_name))
            build_args = (docker_init, dart_root, repo_name.split('/')[-1], docker_img)
            call('source %s && cd %s && docker build -f tools/docker/Dockerfile-%s -t %s .' % build_args)

        _logger.info('pushing docker images')
        push_cmd = ('source %s && cd %s && $(aws ecr get-login)' % (docker_init, dart_root)) + ' && docker push %s'
        for repo_name in output_config['ecr']['repo_names']:
            image = self._docker_image(repo_name, output_config, version=version_of(repo_name))
            call(push_cmd % image)
Example #8
0
def create_cluster(bootstrap_actions_args, cluster_name, datastore, emr_engine, instance_groups_args):
    """Start an EMR cluster through the ``aws emr create-cluster`` CLI.

    The command line is assembled from the datastore args, the engine
    settings and the bootstrap actions, then handed to ``call``; its output
    is parsed as JSON.

    :param bootstrap_actions_args: sequences shaped (name, path, *args), one per bootstrap action
    :param cluster_name: value for ``--name``
    :param datastore: supplies ``data.args['release_label']``, ``data.args['instance_type']`` and ``data.s3_logs_path``
    :param emr_engine: supplies key name, instance profile, availability zone, service role and cluster tags
    :param instance_groups_args: ``instance_groups_args[1][0] + 1`` becomes the instance count
    :return: the ``ClusterId`` entry of the parsed CLI output
    """
    keyname = emr_engine.ec2_keyname
    instance_profile = emr_engine.instance_profile
    az = emr_engine.cluster_availability_zone

    def bootstrap_spec(action):
        # action is (name, path, arg, arg, ...); the args become one quoted, comma separated value
        return 'Path="{path}",Name="{name}",Args=[{args}]'.format(
            name=action[0],
            path=action[1],
            args='"%s"' % (','.join(action[2:])) if len(action[2:]) > 0 else '')

    template = ' '.join((
        'aws emr create-cluster',
        '--release-label {release_label}',
        '--instance-type {instance_type}',
        '--instance-count {instance_count}',
        '--name {cluster_name}',
        '--log-uri {log_uri}',
        '--service-role {service_role}',
        '--configurations {configurations}',
        '--ec2-attributes {ec2_attributes}',
        '--enable-debugging',
        '--tags {tags}',
        '--bootstrap-actions {bootstrap_actions}',
        '--applications {applications}',
    ))

    release_label = datastore.data.args['release_label']
    instance_type = datastore.data.args['instance_type']
    instance_count = instance_groups_args[1][0] + 1
    log_uri = datastore.data.s3_logs_path
    service_role = emr_engine.service_role
    # start_configs.json is looked up in the directory of this module
    configurations = 'file://%s/start_configs.json' % os.path.dirname(os.path.abspath(__file__))
    ec2_attributes = 'KeyName=%s,AvailabilityZone=%s,InstanceProfile=%s' % (keyname, az, instance_profile)
    tags = ' '.join('%s=%s' % (k, v) for k, v in emr_engine.cluster_tags.iteritems())
    bootstrap_actions = ' '.join(bootstrap_spec(a) for a in bootstrap_actions_args)

    cmd = template.format(
        release_label=release_label,
        instance_type=instance_type,
        instance_count=instance_count,
        cluster_name=cluster_name,
        log_uri=log_uri,
        service_role=service_role,
        configurations=configurations,
        ec2_attributes=ec2_attributes,
        tags=tags,
        bootstrap_actions=bootstrap_actions,
        applications='Name=Hadoop Name=Hive Name=Spark',
    )
    response = json.loads(call(cmd))
    return response['ClusterId']
Example #9
0
def s3_copy_recursive(source_path, dest_path):
    """Run ``aws s3 cp --recursive`` from ``source_path`` to ``dest_path`` through ``call``."""
    command = 'aws s3 cp --recursive %s %s' % (source_path, dest_path)
    call(command)
Example #10
0
    def test_hive_table_definition_step(self):
        """Compare the hive copy_to_table script for the weblogs dataset with the stored expectation.

        The dataset is a GZIP TEXTFILE parsed with a regex row format. After
        ``hive_copy_to_table`` has run, the file
        /tmp/dart-emr-test/hive/copy_to_table_weblogs.hql must equal
        copy_to_table_weblogs.hql located next to this test module.
        """
        regex_format = DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            # NOTE(review): the last specifier is "%21s" (no '$'), unlike the other twenty - confirm it is intentional
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        )
        weblog_columns = [
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ]
        partition_columns = [Column(part, 'STRING') for part in ('year', 'week', 'day')]
        dataset = Dataset(data=DatasetData(
            name='weblogs_v01',
            table_name='weblogs',
            location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
            data_format=regex_format,
            columns=weblog_columns,
            compression='GZIP',
            partitions=partition_columns,
        ))

        # stage the script template where hive_copy_to_table is pointed at
        call('mkdir -p /tmp/dart-emr-test/hive/')
        test_dir = os.path.dirname(os.path.abspath(__file__))
        template_source = test_dir + '/../../../engine/emr/steps/hive/copy_to_table.hql'
        shutil.copyfile(template_source, '/tmp/dart-emr-test/hive/copy_to_table.hql')
        hive_copy_to_table(dataset, 'weblogs_stage', dataset, 'weblogs', 's3://test',
                           '/tmp/dart-emr-test/', 'actionid123', None, 1, 1)

        expected_path = os.path.join(test_dir, 'copy_to_table_weblogs.hql')
        with open(expected_path) as expected_file:
            expected_contents = expected_file.read()

        with open('/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql') as actual_file:
            actual_contents = actual_file.read()

        self.assertEqual(expected_contents, actual_contents)
Example #11
0
def s3_copy_recursive(source_path, dest_path):
    """Copy ``source_path`` to ``dest_path`` recursively via the ``aws s3 cp`` CLI, executed with ``call``."""
    endpoints = (source_path, dest_path)
    call('aws s3 cp --recursive %s %s' % endpoints)
    def test_hive_table_definition_step(self):
        """Compare the hive copy_to_table script for the JSON owen_eu dataset with the stored expectation.

        The stage dataset is a copy with every column typed STRING; the
        target dataset is a copy whose data format is RCFILE with no row
        format. After ``hive_copy_to_table`` has run, the file
        /tmp/dart-emr-test/hive/copy_to_table_owen_eu.hql must equal
        copy_to_table_owen_eu.hql located next to this test module.
        """
        json_format = DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.JSON,
        )
        owen_columns = [
            Column('host', 'STRING', path='metadata.host'),
            Column('pageName', 'STRING', path='owen.context.pageName'),
            Column('viewInstanceUuid', 'STRING', path='owen.context.viewInstanceUuid'),
            Column('previousPageName', 'STRING', path='owen.context.previousPageName'),
            Column('previousViewInstanceUuid', 'STRING', path='owen.context.previousViewInstanceUuid'),
            Column('session', 'STRING', path='owen.context.session'),
            Column('pageType', 'STRING', path='owen.context.pageType'),
            Column('propertyName', 'STRING', path='owen.context.propertyName'),
            Column('enviroment', 'STRING', path='owen.context.environment'),
            Column('appForegroundFlag', 'BOOLEAN', path='owen.context.appForegroundFlag'),
            Column('bluetoothEnabledFlag', 'BOOLEAN', path='owen.context.bluetoothEnabledFlag'),
            Column('favoriteFlag', 'BOOLEAN', path='owen.context.favoriteFlag'),
            Column('locationEnabledFlag', 'BOOLEAN', path='owen.context.locationEnabledFlag'),
            Column('loggedInFlag', 'BOOLEAN', path='owen.context.loggedInFlag'),
            Column('notificationEnabledFlag', 'BOOLEAN', path='owen.context.notificationEnabledFlag'),
            Column('personalizationFlag', 'BOOLEAN', path='owen.context.personalizationFlag'),
            Column('advertiserUuid', 'STRING', path='owen.context.advertiserUuid'),
            Column('udid', 'STRING', path='owen.context.udid'),
            Column('userQualifier', 'STRING', path='owen.context.userQualifier'),
            Column('userId', 'STRING', path='owen.context.custom.legacy.userId'),
            Column('userUuid', 'STRING', path='owen.context.userUuid'),
            Column('macAddress', 'STRING', path='owen.context.macAddress'),
            Column('ipAddress', 'STRING', path='owen.context.ipAddress'),
            Column('osVersion', 'STRING', path='owen.context.osVersion'),
            Column('osFamily', 'STRING', path='owen.context.osFamily'),
            Column('osName', 'STRING', path='owen.context.osName'),
            Column('browserFamily', 'STRING', path='owen.context.browserFamily'),
            Column('deviceCategory', 'STRING', path='owen.context.deviceCategory'),
            Column('deviceMake', 'STRING', path='owen.context.mobileDeviceMake'),
            Column('deviceModel', 'STRING', path='owen.context.mobileDeviceModel'),
            Column('connectionType', 'STRING', path='owen.context.connectionType'),
            Column('userAgent', 'STRING', path='owen.context.userAgent'),
            Column('geofenceId', 'STRING', path='owen.context.custom.legacy.geofenceId'),
            Column('eventTimestamp', 'TIMESTAMP', path='owen.event.eventTimestamp', date_pattern="yyyy-MM-dd'T'HH:mm:ssZ"),
            Column('eventInstanceUuid', 'STRING', path='owen.event.eventInstanceUuid'),
            Column('eventPlatformVersion', 'STRING', path='owen.event.eventPlatformVersion'),
            Column('eventVersion', 'STRING', path='owen.event.eventVersion'),
            Column('eventCategory', 'STRING', path='owen.event.eventCategory'),
            Column('eventName', 'STRING', path='owen.event.eventName'),
            Column('eventAction', 'STRING', path='owen.event.eventAction'),
            Column('eventPlatform', 'STRING', path='owen.event.eventPlatform'),
            Column('testUnixTimestampSecondsPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampSecondsPattern', date_pattern='UNIX_TIMESTAMP_SECONDS'),
            Column('testUnixTimestampMillisPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampMillisPattern', date_pattern='UNIX_TIMESTAMP_MILLIS'),
        ]
        partition_columns = [Column(part, 'STRING') for part in ('year', 'week', 'day')]
        source_dataset = Dataset(data=DatasetData(
            name='owen_eu_v01',
            table_name='owen_eu',
            location='s3://s3-rpt-uss-dat-warehouse/prd/inbound/overlord/eu-all-events',
            data_format=json_format,
            columns=owen_columns,
            compression='GZIP',
            partitions=partition_columns,
        ))

        # stage the script template where hive_copy_to_table is pointed at
        call('mkdir -p /tmp/dart-emr-test/hive/')
        test_dir = os.path.dirname(os.path.abspath(__file__))
        template_source = test_dir + '/../../../engine/emr/steps/hive/copy_to_table.hql'
        shutil.copyfile(template_source, '/tmp/dart-emr-test/hive/copy_to_table.hql')
        action_id = 'actionid123'

        target_dataset = Dataset.from_dict(source_dataset.to_dict())
        # NOTE(review): this assignment lands on the format object that is replaced on the next line,
        # so it looks like it has no lasting effect - confirm whether the order is intended
        target_dataset.data.data_format.num_header_rows = 0
        target_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE)

        stage_dataset = Dataset.from_dict(source_dataset.to_dict())
        assert isinstance(stage_dataset, Dataset)
        for stage_column in stage_dataset.data.columns:
            stage_column.data_type = DataType.STRING

        hive_copy_to_table(stage_dataset, 'owen_eu_stage', target_dataset, 'owen_eu', 's3://test',
                           '/tmp/dart-emr-test/', action_id, None, 1, 1)

        expected_path = os.path.join(test_dir, 'copy_to_table_owen_eu.hql')
        with open(expected_path) as expected_file:
            expected_contents = expected_file.read()

        with open('/tmp/dart-emr-test/hive/copy_to_table_owen_eu.hql') as actual_file:
            actual_contents = actual_file.read()

        self.assertEqual(expected_contents, actual_contents)