Example #1
def prepare_bootstrap_actions(datastore,
                              impala_docker_repo_base_url,
                              impala_version,
                              dry_run=False,
                              action=None):
    current_path, current_file = ntpath.split(os.path.abspath(__file__))
    s3_ba_path = '%s/%s/bootstrap_actions' % (datastore.data.s3_artifacts_path,
                                              datastore.data.engine_name)
    local_ba_path = '%s/../bootstrap_actions/' % current_path

    wrapper_script_path = s3_ba_path + '/shell/install_impala_wrapper.sh'
    install_impala_path = s3_ba_path + '/ruby/install_impala.rb'
    copy_resources_path = s3_ba_path + '/shell/copy_resources.sh'
    csv_serde_path = s3_ba_path + '/misc/csv-serde-1.1.2-0.11.0-all.jar'
    results = [
        ('dart: install impala (background)', wrapper_script_path,
         install_impala_path, impala_docker_repo_base_url, impala_version),
        ('dart: copy resources', copy_resources_path, csv_serde_path)
    ]
    if action and action.data.args.get('bootstrap_script'):
        script_path = os.path.join(local_ba_path, 'misc',
                                   'bootstrap_script_%s.txt' % action.id)
        with open(script_path, 'w') as f:
            f.write(action.data.args['bootstrap_script'])
        results.append(
            ('dart: custom bootstrap_script',
             s3_ba_path + '/misc/bootstrap_script_%s.txt' % action.id))

    if not dry_run:
        s3_copy_recursive(local_ba_path, s3_ba_path)

    return results
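When dry_run is set, the s3_copy_recursive upload is skipped and only the bootstrap-action tuples are built, which makes the function easy to exercise in isolation. A minimal usage sketch, assuming a stand-in datastore object that exposes the data.s3_artifacts_path and data.engine_name attributes the function reads; the bucket, repo URL, and version below are placeholders, not values from the original project:

class _Namespace(object):
    # tiny stand-in for the dart model objects used above
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

datastore = _Namespace(data=_Namespace(
    s3_artifacts_path='s3://example-bucket/dart/artifacts',
    engine_name='emr_engine'))

# dry_run=True builds the (name, script_s3_path, args...) tuples without uploading
for entry in prepare_bootstrap_actions(datastore,
                                       'https://example.com/impala-docker',
                                       '2.2.0',
                                       dry_run=True):
    print(entry)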
Example #2
def prepare_run_impala_script_steps(dry_run, datastore, action, src, dest):
    # some steps require producing a dataset-specific file based on a template, so we will copy everything to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(datastore, tempdir)
        steps = [python_copy_hdfs_to_s3(s3_step_path, src, dest, s3_temp_path, action.id, 1, 1)]

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)
Example #3
def prepare_run_hive_script_steps(dry_run, datastore, action, script_contents):
    # some steps require producing a dataset-specific file based on a template, so we will copy everything to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(datastore, tempdir)
        steps = [hive_run_script_contents_step(script_contents, s3_step_path, local_step_path, action.id, 1, 1)]

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)
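Both prepare_run_impala_script_steps and prepare_run_hive_script_steps follow the same tempdir lifecycle: build step files in a scratch directory, upload them unless dry_run is set, and always clean up in the finally block. The pattern in isolation, with generic names that are not part of dart:

import shutil
import tempfile

def run_with_scratch_dir(build_fn, upload_fn=None):
    # build artifacts in a throwaway directory, optionally upload them,
    # and remove the directory even if building or uploading raises
    tempdir = tempfile.mkdtemp()
    try:
        result = build_fn(tempdir)
        if upload_fn:
            upload_fn(tempdir)
        return result
    finally:
        shutil.rmtree(tempdir)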
Example #4
def prepare_bootstrap_actions(datastore, impala_docker_repo_base_url, impala_version, dry_run=False):
    current_path, current_file = ntpath.split(os.path.abspath(__file__))
    s3_ba_path = '%s/%s/bootstrap_actions' % (datastore.data.s3_artifacts_path, datastore.data.engine_name)
    local_ba_path = '%s/../bootstrap_actions/' % current_path
    if not dry_run:
        s3_copy_recursive(local_ba_path, s3_ba_path)

    wrapper_script_path = s3_ba_path + '/shell/install_impala_wrapper.sh'
    install_impala_path = s3_ba_path + '/ruby/install_impala.rb'
    copy_resources_path = s3_ba_path + '/shell/copy_resources.sh'
    csv_serde_path = s3_ba_path + '/misc/csv-serde-1.1.2-0.11.0-all.jar'
    return [
        ('dart: install impala (background)', wrapper_script_path, install_impala_path, impala_docker_repo_base_url,
         impala_version),
        ('dart: copy resources', copy_resources_path, csv_serde_path),
    ]
Example #5
def prepare_run_impala_script_steps(dry_run, datastore, action, src, dest):
    # some steps require producing a dataset-specific file based on a template, so we will copy everything to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(
            datastore, tempdir)
        steps = [
            python_copy_hdfs_to_s3(s3_step_path, src, dest, s3_temp_path,
                                   action.id, 1, 1)
        ]

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)
Example #6
def prepare_bootstrap_actions(datastore,
                              impala_docker_repo_base_url,
                              impala_version,
                              dry_run=False):
    current_path, current_file = ntpath.split(os.path.abspath(__file__))
    s3_ba_path = '%s/%s/bootstrap_actions' % (datastore.data.s3_artifacts_path,
                                              datastore.data.engine_name)
    local_ba_path = '%s/../bootstrap_actions/' % current_path
    if not dry_run:
        s3_copy_recursive(local_ba_path, s3_ba_path)

    wrapper_script_path = s3_ba_path + '/shell/install_impala_wrapper.sh'
    install_impala_path = s3_ba_path + '/ruby/install_impala.rb'
    copy_resources_path = s3_ba_path + '/shell/copy_resources.sh'
    csv_serde_path = s3_ba_path + '/misc/csv-serde-1.1.2-0.11.0-all.jar'
    return [
        ('dart: install impala (background)', wrapper_script_path,
         install_impala_path, impala_docker_repo_base_url, impala_version),
        ('dart: copy resources', copy_resources_path, csv_serde_path),
    ]
Example #7
def prepare_bootstrap_actions(datastore, impala_docker_repo_base_url, impala_version, dry_run=False, action=None):
    current_path, current_file = ntpath.split(os.path.abspath(__file__))
    s3_ba_path = '%s/%s/bootstrap_actions' % (datastore.data.s3_artifacts_path, datastore.data.engine_name)
    local_ba_path = '%s/../bootstrap_actions/' % current_path

    wrapper_script_path = s3_ba_path + '/shell/install_impala_wrapper.sh'
    install_impala_path = s3_ba_path + '/ruby/install_impala.rb'
    copy_resources_path = s3_ba_path + '/shell/copy_resources.sh'
    csv_serde_path = s3_ba_path + '/misc/csv-serde-1.1.2-0.11.0-all.jar'
    results = [
        ('dart: install impala (background)', wrapper_script_path, install_impala_path, impala_docker_repo_base_url, impala_version),
        ('dart: copy resources', copy_resources_path, csv_serde_path)
    ]
    if action and action.data.args.get('bootstrap_script'):
        script_path = os.path.join(local_ba_path, 'misc', 'bootstrap_script_%s.txt' % action.id)
        with open(script_path, 'w') as f:
            f.write(action.data.args['bootstrap_script'])
        results.append(('dart: custom bootstrap_script', s3_ba_path + '/misc/bootstrap_script_%s.txt' % action.id))

    if not dry_run:
        s3_copy_recursive(local_ba_path, s3_ba_path)

    return results
Example #8
def prepare_load_dataset_steps(dry_run,
                               args_by_name,
                               datastore,
                               dataset,
                               action_id,
                               s3_path_and_file_size_gen,
                               target_is_dynamodb=False):
    """ :type dataset: dart.model.dataset.Dataset """
    def add_to(step_partials, step_num, func, *args):
        # add all params except the last one, which is the total steps (known at the end)
        step_partials.append(
            functools.partial(func, *(list(args) + [step_num])))
        return step_num + 1

    def stage_table_not_needed(ds, file_format, row_format, compression,
                               delimited_by, quoted_by, escaped_by,
                               null_string):
        """ :type ds: dart.model.dataset.Dataset """
        return file_format == ds.data.data_format.file_format\
            and row_format == ds.data.data_format.row_format\
            and compression == ds.data.compression\
            and delimited_by == ds.data.data_format.delimited_by\
            and quoted_by == ds.data.data_format.quoted_by\
            and escaped_by == ds.data.data_format.escaped_by\
            and null_string == ds.data.data_format.null_string

    # some steps require producing a dataset-specific file based on a template, so we will copy everything to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(
            datastore, tempdir)
        target_table_name = args_by_name.get(
            'target_table_name') or dataset.data.table_name
        target_file_format = args_by_name.get(
            'target_file_format') or dataset.data.data_format.file_format
        target_row_format = args_by_name.get(
            'target_row_format') or dataset.data.data_format.row_format
        target_compression = args_by_name.get(
            'target_compression') or dataset.data.compression
        target_delimited_by = args_by_name.get(
            'target_delimited_by') or dataset.data.data_format.delimited_by
        target_quoted_by = args_by_name.get(
            'target_quoted_by') or dataset.data.data_format.quoted_by
        target_escaped_by = args_by_name.get(
            'target_escaped_by') or dataset.data.data_format.escaped_by
        target_null_string = args_by_name.get(
            'target_null_string') or dataset.data.data_format.null_string

        stage_table_name = target_table_name + '_stage_for_action_' + action_id
        staging_not_needed = stage_table_not_needed(
            dataset, target_file_format, target_row_format, target_compression,
            target_delimited_by, target_quoted_by, target_escaped_by,
            target_null_string)
        first_table_name = target_table_name if staging_not_needed and not target_is_dynamodb else stage_table_name

        drop_table_names = []
        step_funcs = []
        i = 1

        # ------------------------------------------------------------------------------------------------------------
        # all code paths below require copying the data to HDFS, and lowercasing the table name is required because of hive
        # ------------------------------------------------------------------------------------------------------------
        i = add_to(step_funcs, i,
                   s3distcp_files_step, s3_path_and_file_size_gen,
                   first_table_name.lower(), dataset, s3_step_path,
                   local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # not all folder structures on s3 are hive compatible... if not, rename directories after copying
        # ------------------------------------------------------------------------------------------------------------
        if dataset.data.partitions and not dataset.data.hive_compatible_partition_folders:
            i = add_to(step_funcs, i, python_fix_partition_folder_names,
                       first_table_name.lower(), dataset.data.partitions,
                       s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # special case to share functionality with the dynamodb_engine
        # ------------------------------------------------------------------------------------------------------------
        if target_is_dynamodb:
            dyn_dataset = Dataset.from_dict(dataset.to_dict())
            assert isinstance(dyn_dataset, Dataset)
            dyn_dataset.data.data_format = DataFormat('DYNAMODB_TABLE',
                                                      RowFormat.NONE, 0)
            dyn_dataset.data.compression = Compression.NONE
            dyn_dataset.data.columns = [
                Column(c.name, dynamodb_column_type(c))
                for c in dataset.data.columns
            ]
            set_hive_vars = 'SET dynamodb.retry.duration = 0;\nSET dynamodb.throughput.write.percent = %s;'
            set_hive_vars = set_hive_vars % args_by_name[
                'write_capacity_utilization_percent']

            i = add_to(step_funcs, i, hive_table_definition_step,
                       stage_table_name, dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step,
                       target_table_name, dyn_dataset, s3_step_path,
                       local_step_path, action_id, True)
            i = add_to(step_funcs, i, hive_msck_repair_table_step,
                       stage_table_name, s3_step_path, action_id)
            i = add_to(step_funcs, i, hive_copy_to_table, dataset,
                       stage_table_name, dyn_dataset, target_table_name,
                       s3_step_path, local_step_path, action_id, set_hive_vars)

        # ------------------------------------------------------------------------------------------------------------
        # if no stage tables are needed, much complexity can be skipped
        # ------------------------------------------------------------------------------------------------------------
        elif staging_not_needed:
            i = add_to(step_funcs, i, hive_table_definition_step,
                       target_table_name, dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step,
                       target_table_name, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # one or more staging tables are needed
        # ------------------------------------------------------------------------------------------------------------
        else:
            stage_dataset = dataset
            target_dataset = Dataset.from_dict(dataset.to_dict())
            target_dataset.data.data_format = DataFormat(
                target_file_format, target_row_format, 0, target_delimited_by,
                target_quoted_by, target_escaped_by, target_null_string)
            target_dataset.data.compression = target_compression
            drop_table_names.append(stage_table_name)

            # --------------------------------------------------------------------------------------------------------
            # define string types for JSON/REGEX based datasets (safe), and we will cast appropriately during insert
            # --------------------------------------------------------------------------------------------------------
            if stage_dataset.data.data_format.row_format in [
                    RowFormat.JSON, RowFormat.REGEX
            ]:
                # make a copy since we are modifying the columns
                stage_dataset = Dataset.from_dict(dataset.to_dict())
                assert isinstance(stage_dataset, Dataset)
                for c in stage_dataset.data.columns:
                    c.data_type = DataType.STRING

            i = add_to(step_funcs, i, hive_table_definition_step,
                       stage_table_name, stage_dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step,
                       target_table_name, target_dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step,
                       stage_table_name, s3_step_path, action_id)

            # --------------------------------------------------------------------------------------------------------
            # hive has issues creating parquet files
            # --------------------------------------------------------------------------------------------------------
            if target_file_format != FileFormat.PARQUET:
                i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset,
                           stage_table_name, target_dataset, target_table_name,
                           s3_step_path, local_step_path, action_id, None)

            # --------------------------------------------------------------------------------------------------------
            # impala is better for creating parquet files
            # --------------------------------------------------------------------------------------------------------
            else:
                # ----------------------------------------------------------------------------------------------------
                # no additional staging tables needed if the source dataset file format is RCFILE (impala friendly)
                # ----------------------------------------------------------------------------------------------------
                if dataset.data.data_format.file_format == FileFormat.RCFILE:
                    i = add_to(step_funcs, i, hive_copy_to_table,
                               stage_dataset, stage_table_name, target_dataset,
                               target_table_name, s3_step_path,
                               local_step_path, action_id, None)

                # ----------------------------------------------------------------------------------------------------
                # impala cannot read all hive formats, so we will introduce another staging table
                # ----------------------------------------------------------------------------------------------------
                else:
                    rc_table_name = target_table_name + '_rcfile_stage_for_action_' + action_id
                    rc_dataset = Dataset.from_dict(target_dataset.to_dict())
                    rc_dataset.data.data_format = DataFormat(
                        FileFormat.RCFILE, RowFormat.NONE, 0)
                    rc_dataset.data.compression = Compression.NONE
                    drop_table_names.append(rc_table_name)

                    i = add_to(step_funcs, i, hive_table_definition_step,
                               rc_table_name, rc_dataset, s3_step_path,
                               local_step_path, action_id, False)
                    i = add_to(step_funcs, i, hive_copy_to_table,
                               stage_dataset, stage_table_name, rc_dataset,
                               rc_table_name, s3_step_path, local_step_path,
                               action_id, None)
                    i = add_to(step_funcs, i, impala_copy_to_table, rc_dataset,
                               rc_table_name, target_dataset,
                               target_table_name, s3_step_path,
                               local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # at this point, the load should be considered complete even if something goes wrong in the steps below,
        # so we will indicate this in the step wrapper
        # ------------------------------------------------------------------------------------------------------------
        considered_successful_at_this_index = i

        # ------------------------------------------------------------------------------------------------------------
        # drop any staging tables created
        # ------------------------------------------------------------------------------------------------------------
        if drop_table_names:
            script = '\n'.join(
                ['DROP TABLE %s;' % name for name in drop_table_names])
            i = add_to(step_funcs, i, hive_run_script_contents_step, script,
                       s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # inform impala about changes
        # ------------------------------------------------------------------------------------------------------------
        if not target_is_dynamodb:
            i = add_to(step_funcs, i, impala_invalidate_metadata_step,
                       s3_step_path, action_id)

        total_steps = i - 1
        steps = []
        for index, f in enumerate(step_funcs, 1):
            step_wrapper = f(total_steps)
            assert isinstance(step_wrapper, StepWrapper)
            if index >= considered_successful_at_this_index:
                step_wrapper.action_considered_successful = True
            steps.append(step_wrapper)

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)
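The add_to helper defers the step total: each functools.partial binds everything except total_steps, which only becomes known once every branch has contributed its steps and is then supplied when the partials are called. A stripped-down illustration of the same idea, where make_step is a hypothetical stand-in for builders like hive_table_definition_step (which in dart return StepWrapper instances rather than strings):

import functools

def make_step(name, step_num, total_steps):
    # stand-in for the real step builders
    return '%s (step %s of %s)' % (name, step_num, total_steps)

def add_to(step_partials, step_num, func, *args):
    # bind all params except the last one, total_steps, which is known at the end
    step_partials.append(functools.partial(func, *(list(args) + [step_num])))
    return step_num + 1

step_funcs = []
i = 1
i = add_to(step_funcs, i, make_step, 'copy files to hdfs')
i = add_to(step_funcs, i, make_step, 'create hive table')
total_steps = i - 1
steps = [f(total_steps) for f in step_funcs]
# steps == ['copy files to hdfs (step 1 of 2)', 'create hive table (step 2 of 2)']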
Example #9
def prepare_load_dataset_steps(dry_run, args_by_name, datastore, dataset, action_id, s3_path_and_file_size_gen,
                               target_is_dynamodb=False):
    """ :type dataset: dart.model.dataset.Dataset """

    def add_to(step_partials, step_num, func, *args):
        # add all params except the last one, which is the total steps (known at the end)
        step_partials.append(functools.partial(func, *(list(args) + [step_num])))
        return step_num + 1

    def stage_table_not_needed(ds, file_format, row_format, compression, delimited_by, quoted_by,
                               escaped_by, null_string):
        """ :type ds: dart.model.dataset.Dataset """
        return file_format == ds.data.data_format.file_format\
            and row_format == ds.data.data_format.row_format\
            and compression == ds.data.compression\
            and delimited_by == ds.data.data_format.delimited_by\
            and quoted_by == ds.data.data_format.quoted_by\
            and escaped_by == ds.data.data_format.escaped_by\
            and null_string == ds.data.data_format.null_string

    # some steps require producing a dataset-specific file based on a template, so we will copy everything to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(datastore, tempdir)
        target_table_name = args_by_name.get('target_table_name') or dataset.data.table_name
        target_file_format = args_by_name.get('target_file_format') or dataset.data.data_format.file_format
        target_row_format = args_by_name.get('target_row_format') or dataset.data.data_format.row_format
        target_compression = args_by_name.get('target_compression') or dataset.data.compression
        target_delimited_by = args_by_name.get('target_delimited_by') or dataset.data.data_format.delimited_by
        target_quoted_by = args_by_name.get('target_quoted_by') or dataset.data.data_format.quoted_by
        target_escaped_by = args_by_name.get('target_escaped_by') or dataset.data.data_format.escaped_by
        target_null_string = args_by_name.get('target_null_string') or dataset.data.data_format.null_string

        stage_table_name = target_table_name + '_stage_for_action_' + action_id
        staging_not_needed = stage_table_not_needed(dataset, target_file_format, target_row_format, target_compression,
                                                    target_delimited_by, target_quoted_by, target_escaped_by, target_null_string)
        first_table_name = target_table_name if staging_not_needed and not target_is_dynamodb else stage_table_name

        drop_table_names = []
        step_funcs = []
        i = 1

        # ------------------------------------------------------------------------------------------------------------
        # all code paths below require copying the data to HDFS, and lowercasing the table name is required because of hive
        # ------------------------------------------------------------------------------------------------------------
        i = add_to(step_funcs, i, s3distcp_files_step, s3_path_and_file_size_gen, first_table_name.lower(), dataset, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # not all folder structures on s3 are hive compatible... if not, rename directories after copying
        # ------------------------------------------------------------------------------------------------------------
        if dataset.data.partitions and not dataset.data.hive_compatible_partition_folders:
            i = add_to(step_funcs, i, python_fix_partition_folder_names, first_table_name.lower(), dataset.data.partitions, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # special case to share functionality with the dynamodb_engine
        # ------------------------------------------------------------------------------------------------------------
        if target_is_dynamodb:
            dyn_dataset = Dataset.from_dict(dataset.to_dict())
            assert isinstance(dyn_dataset, Dataset)
            dyn_dataset.data.data_format = DataFormat('DYNAMODB_TABLE', RowFormat.NONE, 0)
            dyn_dataset.data.compression = Compression.NONE
            dyn_dataset.data.columns = [Column(c.name, dynamodb_column_type(c)) for c in dataset.data.columns]
            set_hive_vars = 'SET dynamodb.retry.duration = 0;\nSET dynamodb.throughput.write.percent = %s;'
            set_hive_vars = set_hive_vars % args_by_name['write_capacity_utilization_percent']

            i = add_to(step_funcs, i, hive_table_definition_step, stage_table_name, dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, dyn_dataset, s3_step_path, local_step_path, action_id, True)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)
            i = add_to(step_funcs, i, hive_copy_to_table, dataset, stage_table_name, dyn_dataset, target_table_name, s3_step_path, local_step_path, action_id, set_hive_vars)

        # ------------------------------------------------------------------------------------------------------------
        # if no stage tables are needed, much complexity can be skipped
        # ------------------------------------------------------------------------------------------------------------
        elif staging_not_needed:
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, target_table_name, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # one or more staging tables are needed
        # ------------------------------------------------------------------------------------------------------------
        else:
            stage_dataset = dataset
            target_dataset = Dataset.from_dict(dataset.to_dict())
            target_dataset.data.data_format = DataFormat(target_file_format, target_row_format, 0, target_delimited_by,
                                                         target_quoted_by, target_escaped_by, target_null_string)
            target_dataset.data.compression = target_compression
            drop_table_names.append(stage_table_name)

            # --------------------------------------------------------------------------------------------------------
            # define string types for JSON/REGEX based datasets (safe), and we will cast appropriately during insert
            # --------------------------------------------------------------------------------------------------------
            if stage_dataset.data.data_format.row_format in [RowFormat.JSON, RowFormat.REGEX]:
                # make a copy since we are modifying the columns
                stage_dataset = Dataset.from_dict(dataset.to_dict())
                assert isinstance(stage_dataset, Dataset)
                for c in stage_dataset.data.columns:
                    c.data_type = DataType.STRING

            i = add_to(step_funcs, i, hive_table_definition_step, stage_table_name, stage_dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, target_dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)

            # --------------------------------------------------------------------------------------------------------
            # hive has issues creating parquet files
            # --------------------------------------------------------------------------------------------------------
            if target_file_format != FileFormat.PARQUET:
                i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, target_dataset, target_table_name, s3_step_path, local_step_path, action_id, None)

            # --------------------------------------------------------------------------------------------------------
            # impala is better for creating parquet files
            # --------------------------------------------------------------------------------------------------------
            else:
                # ----------------------------------------------------------------------------------------------------
                # no additional staging tables needed if the source dataset file format is RCFILE (impala friendly)
                # ----------------------------------------------------------------------------------------------------
                if dataset.data.data_format.file_format == FileFormat.RCFILE:
                    i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, target_dataset, target_table_name, s3_step_path, local_step_path, action_id, None)

                # ----------------------------------------------------------------------------------------------------
                # impala cannot read all hive formats, so we will introduce another staging table
                # ----------------------------------------------------------------------------------------------------
                else:
                    rc_table_name = target_table_name + '_rcfile_stage_for_action_' + action_id
                    rc_dataset = Dataset.from_dict(target_dataset.to_dict())
                    rc_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE, 0)
                    rc_dataset.data.compression = Compression.NONE
                    drop_table_names.append(rc_table_name)

                    i = add_to(step_funcs, i, hive_table_definition_step, rc_table_name, rc_dataset, s3_step_path, local_step_path, action_id, False)
                    i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, rc_dataset, rc_table_name, s3_step_path, local_step_path, action_id, None)
                    i = add_to(step_funcs, i, impala_copy_to_table, rc_dataset, rc_table_name, target_dataset, target_table_name, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # at this point, the load should be considered complete even if something goes wrong in the steps below,
        # so we will indicate this in the step wrapper
        # ------------------------------------------------------------------------------------------------------------
        considered_successful_at_this_index = i

        # ------------------------------------------------------------------------------------------------------------
        # drop any staging tables created
        # ------------------------------------------------------------------------------------------------------------
        if drop_table_names:
            script = '\n'.join(['DROP TABLE %s;' % name for name in drop_table_names])
            i = add_to(step_funcs, i, hive_run_script_contents_step, script, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # inform impala about changes
        # ------------------------------------------------------------------------------------------------------------
        if not target_is_dynamodb:
            i = add_to(step_funcs, i, impala_invalidate_metadata_step, s3_step_path, action_id)

        total_steps = i - 1
        steps = []
        for index, f in enumerate(step_funcs, 1):
            step_wrapper = f(total_steps)
            assert isinstance(step_wrapper, StepWrapper)
            if index >= considered_successful_at_this_index:
                step_wrapper.action_considered_successful = True
            steps.append(step_wrapper)

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)