Beispiel #1
0
def run_hive_script(emr_engine, datastore, action):
    """
    Build the steps for the action's ``script_contents`` argument and either run them or, when the
    datastore's ``dry_run`` argument is truthy, only attach them to the action.

    In both cases the action is patched with ``progress=1`` at the end.

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    is_dry_run = datastore.data.args['dry_run']
    script_contents = action.data.args['script_contents']
    hive_steps = prepare_run_hive_script_steps(is_dry_run, datastore, action, script_contents)

    if not is_dry_run:
        # Real run: execute the prepared steps, then mark the action as complete.
        run_steps(emr_engine, datastore, action, hive_steps)
        emr_engine.dart.patch_action(action, progress=1)
        return

    # Dry run: run_steps is skipped; the serialized steps are stored on the action instead.
    patch_action = emr_engine.dart.patch_action
    step_dicts = [step.to_dict() for step in hive_steps]
    patch_action(action, progress=1, extra_data={'steps': step_dicts})
Beispiel #2
0
def load_dataset(emr_engine, datastore, action):
    """
    Prepare the load steps for the dataset named by the action's ``dataset_id`` argument, then run them
    (or, on a dry run, only record them on the action).

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    dart_client = emr_engine.dart
    action_args = action.data.args

    target_dataset = dart_client.get_dataset(action_args['dataset_id'])
    path_and_size_iter = load_dataset_s3_path_and_file_size_generator(emr_engine, action, target_dataset)
    is_dry_run = datastore.data.args['dry_run']
    load_steps = prepare_load_dataset_steps(
        is_dry_run, action_args, datastore, target_dataset, action.id, path_and_size_iter)

    if not is_dry_run:
        run_steps(emr_engine, datastore, action, load_steps)
        dart_client.patch_action(action, progress=1)
        return

    # Dry run: run_steps is skipped and the serialized steps are attached to the action.
    dart_client.patch_action(action, progress=1, extra_data={'steps': [step.to_dict() for step in load_steps]})
Beispiel #3
0
def consume_subscription(emr_engine,
                         datastore,
                         action,
                         consume_successful=False):
    """
    Prepare and run the load steps for the subscription named by the action's ``subscription_id`` argument.

    On a dry run (datastore ``dry_run`` argument truthy) the steps are only serialized onto the action and
    nothing is passed to ``run_steps``.

    Having "consume_successful" as parameter (compared to a local variable) is helpful for testing

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    :param consume_successful: initial value of the "consume succeeded" flag; it is set to True once the
        steps have run (or on a dry run), and also when a DartActionException carries a step wrapper whose
        ``action_considered_successful`` is truthy.
    :raises ActionFailedButConsumeSuccessfulException: an error occurred while the flag was True.
    :raises Exception: an error occurred while the flag was still False.
    """
    try:
        subscription = emr_engine.dart.get_subscription(
            action.data.args['subscription_id'])
        dataset = emr_engine.dart.get_dataset(subscription.data.dataset_id)
        dry_run = datastore.data.args['dry_run']
        generator = subscription_s3_path_and_file_size_generator(
            emr_engine.dart, action.id)
        steps = prepare_load_dataset_steps(dry_run, action.data.args,
                                           datastore, dataset, action.id,
                                           generator)
        if dry_run:
            consume_successful = True
            emr_engine.dart.patch_action(
                action,
                progress=1,
                extra_data={'steps': [s.to_dict() for s in steps]})
            return

        run_steps(emr_engine, datastore, action, steps)
        consume_successful = True
        emr_engine.dart.patch_action(action, progress=1)

    except Exception as e:
        # ``e.message`` does not exist on Python 3 and, on Python 2, is empty unless the exception was
        # built with exactly one argument; fall back to str(e) so the original error text is never lost.
        error_text = getattr(e, 'message', None) or str(e)

        if isinstance(e, DartActionException):
            step_wrapper = e.data
            if step_wrapper.action_considered_successful:
                consume_successful = True

        # %-formatting (rather than ``+``) keeps the handler from raising TypeError when the exception's
        # message is not a string, which would otherwise mask the real error.
        if consume_successful:
            raise ActionFailedButConsumeSuccessfulException(
                'Although the following error occurred, the important EMR steps completed successfully and the'
                ' subscription elements have been marked as CONSUMED:\n%s' % (error_text,))

        raise Exception(
            'The following error occurred, and the subscription elements have been reverted to UNCONSUMED:\n%s'
            % (error_text,))
Beispiel #4
0
def copy_hdfs_to_s3(emr_engine, datastore, action):
    """
    Prepare steps from the action's ``source_hdfs_path`` and ``destination_s3_path`` arguments, run them
    unless the datastore is in dry-run mode, and record the steps on the action with ``progress=1``.

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    hdfs_source = action.data.args['source_hdfs_path']
    s3_destination = action.data.args['destination_s3_path']
    is_dry_run = datastore.data.args['dry_run']

    # NOTE(review): the steps come from prepare_run_impala_script_steps -- confirm this is the intended
    # helper for an HDFS -> S3 copy.
    copy_steps = prepare_run_impala_script_steps(is_dry_run, datastore, action, hdfs_source, s3_destination)

    # Only a real run executes the steps; the final patch is the same for both modes.
    if not is_dry_run:
        run_steps(emr_engine, datastore, action, copy_steps)

    emr_engine.dart.patch_action(action, progress=1, extra_data={'steps': copy_steps})
Beispiel #5
0
def consume_subscription(emr_engine, datastore, action, consume_successful=False):
    """
    Prepare and run the load steps for the subscription named by the action's ``subscription_id`` argument.

    On a dry run (datastore ``dry_run`` argument truthy) nothing is passed to ``run_steps``; instead the
    prepared steps and the first five items yielded by the S3 path / file size generator are stored on the
    action.

    Having "consume_successful" as parameter (compared to a local variable) is helpful for testing

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    :param consume_successful: initial value of the "consume succeeded" flag; it is set to True once the
        steps have run (or on a dry run), and also when a DartActionException carries a step wrapper whose
        ``action_considered_successful`` is truthy.
    :raises ActionFailedButConsumeSuccessfulException: an error occurred while the flag was True.
    :raises Exception: an error occurred while the flag was still False.
    """
    try:
        subscription = emr_engine.dart.get_subscription(action.data.args['subscription_id'])
        dataset = emr_engine.dart.get_dataset(subscription.data.dataset_id)
        dry_run = datastore.data.args['dry_run']

        generator_seed = subscription_s3_path_and_file_size_generator(emr_engine.dart, action.id)
        if dry_run:
            # The second iterator is only read for the dry-run preview below.
            generator, preview_generator = itertools.tee(generator_seed)
        else:
            # On a real run nothing would ever read the second iterator, so tee would buffer every
            # yielded item for the lifetime of this call; pass the generator through untouched instead.
            generator, preview_generator = generator_seed, None
        steps = prepare_load_dataset_steps(dry_run, action.data.args, datastore, dataset, action.id, generator)

        if dry_run:
            consume_successful = True
            emr_engine.dart.patch_action(action, progress=1, extra_data={
                'steps': steps,
                'first_5_s3_paths_and_file_sizes': list(itertools.islice(preview_generator, 5)),
            })
            return

        run_steps(emr_engine, datastore, action, steps)
        consume_successful = True
        emr_engine.dart.patch_action(action, progress=1)

    except Exception as e:
        # ``e.message`` does not exist on Python 3 and, on Python 2, is empty unless the exception was
        # built with exactly one argument; fall back to str(e) so the original error text is never lost.
        error_text = getattr(e, 'message', None) or str(e)

        if isinstance(e, DartActionException):
            step_wrapper = e.data
            if step_wrapper.action_considered_successful:
                consume_successful = True

        # %-formatting (rather than ``+``) keeps the handler from raising TypeError when the exception's
        # message is not a string, which would otherwise mask the real error.
        if consume_successful:
            raise ActionFailedButConsumeSuccessfulException(
                'Although the following error occurred, the important EMR steps completed successfully and the'
                ' subscription elements have been marked as CONSUMED:\n%s' % (error_text,))

        raise Exception(
            'The following error occurred, and the subscription elements have been reverted to UNCONSUMED:\n%s'
            % (error_text,))
Beispiel #6
0
def load_dataset(emr_engine, datastore, action):
    """
    Fetch the dataset referenced by the action's ``dataset_id`` argument, prepare its load steps, and
    finish by patching the action with ``progress=1``.

    When the datastore's ``dry_run`` argument is truthy the steps are not run; their ``to_dict()`` forms
    are sent along with the patch as ``extra_data`` instead.

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    dataset = emr_engine.dart.get_dataset(action.data.args['dataset_id'])
    s3_files = load_dataset_s3_path_and_file_size_generator(emr_engine, action, dataset)
    dry_run = datastore.data.args['dry_run']
    steps = prepare_load_dataset_steps(dry_run, action.data.args, datastore, dataset, action.id, s3_files)

    # Both modes end with a single patch_action call; only its keyword arguments differ.
    patch_kwargs = {'progress': 1}
    if dry_run:
        patch_kwargs['extra_data'] = {'steps': [step.to_dict() for step in steps]}
    else:
        run_steps(emr_engine, datastore, action, steps)

    emr_engine.dart.patch_action(action, **patch_kwargs)
Beispiel #7
0
def load_dataset(emr_engine, datastore, action):
    """
    Prepare the load steps for the dataset named by the action's ``dataset_id`` argument and run them.

    On a dry run (datastore ``dry_run`` argument truthy) the steps are not passed to ``run_steps``;
    instead the prepared steps and the first five items yielded by the S3 path / file size generator are
    stored on the action. In both modes the action is patched with ``progress=1``.

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    dataset = emr_engine.dart.get_dataset(action.data.args['dataset_id'])
    generator_seed = load_dataset_s3_path_and_file_size_generator(emr_engine, action, dataset)
    dry_run = datastore.data.args['dry_run']

    if dry_run:
        # The second iterator is only read for the dry-run preview below.
        generator, preview_generator = itertools.tee(generator_seed)
    else:
        # On a real run nothing would ever read the second iterator, so tee would buffer every yielded
        # item for the lifetime of this call; pass the generator through untouched instead.
        generator, preview_generator = generator_seed, None
    steps = prepare_load_dataset_steps(dry_run, action.data.args, datastore, dataset, action.id, generator)

    if dry_run:
        emr_engine.dart.patch_action(action, progress=1, extra_data={
            'steps': steps,
            'first_5_s3_paths_and_file_sizes': list(itertools.islice(preview_generator, 5)),
        })
        return

    run_steps(emr_engine, datastore, action, steps)
    emr_engine.dart.patch_action(action, progress=1)
Beispiel #8
0
def copy_hdfs_to_s3(emr_engine, datastore, action):
    """
    Prepare steps from the action's ``source_hdfs_path`` and ``destination_s3_path`` arguments and run
    them, or -- when the datastore's ``dry_run`` argument is truthy -- only record their ``to_dict()``
    forms on the action. Either way the action is patched with ``progress=1``.

    :type emr_engine: dart.engine.emr.emr.EmrEngine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    action_args = action.data.args
    hdfs_source = action_args['source_hdfs_path']
    s3_destination = action_args['destination_s3_path']
    is_dry_run = datastore.data.args['dry_run']

    # NOTE(review): the steps come from prepare_run_impala_script_steps -- confirm this is the intended
    # helper for an HDFS -> S3 copy.
    copy_steps = prepare_run_impala_script_steps(is_dry_run, datastore, action, hdfs_source, s3_destination)

    if not is_dry_run:
        run_steps(emr_engine, datastore, action, copy_steps)
        emr_engine.dart.patch_action(action, progress=1)
        return

    # Dry run: serialize each prepared step and attach the result to the action.
    patch_action = emr_engine.dart.patch_action
    serialized_steps = []
    for step in copy_steps:
        serialized_steps.append(step.to_dict())
    patch_action(action, progress=1, extra_data={'steps': serialized_steps})