Exemple #1
0
    def test_split_job_type(self):
        """Check split_job_type on a plain and a streaming MapReduce type."""
        # Each row: (input type, expected job type, expected subtype).
        expectations = (
            ("MapReduce", "MapReduce", ""),
            (MAPRED_STREAMING, "MapReduce", "Streaming"),
        )
        for raw_type, want_type, want_subtype in expectations:
            got_type, got_subtype = edp.split_job_type(raw_type)
            self.assertEqual(got_type, want_type)
            self.assertEqual(got_subtype, want_subtype)
Exemple #2
0
    def test_split_job_type(self):
        """Verify split_job_type for MapReduce and MapReduce streaming."""

        def assert_split(full_type, expected_subtype):
            # Both inputs are expected to resolve to the MapReduce job type;
            # only the subtype differs between them.
            base, sub = edp.split_job_type(full_type)
            self.assertEqual(edp.JOB_TYPE_MAPREDUCE, base)
            self.assertEqual(expected_subtype, sub)

        assert_split(edp.JOB_TYPE_MAPREDUCE, edp.JOB_SUBTYPE_NONE)
        assert_split(edp.JOB_TYPE_MAPREDUCE_STREAMING,
                     edp.JOB_SUBTYPE_STREAMING)
    def test_split_job_type(self):
        """split_job_type must return the MapReduce type and its subtype."""
        mapreduce = edp.JOB_TYPE_MAPREDUCE

        # Plain MapReduce: the subtype must be the "none" marker.
        main_type, sub_type = edp.split_job_type(mapreduce)
        self.assertEqual(mapreduce, main_type)
        self.assertEqual(edp.JOB_SUBTYPE_NONE, sub_type)

        # Streaming variant: same main type, streaming subtype.
        main_type, sub_type = edp.split_job_type(
            edp.JOB_TYPE_MAPREDUCE_STREAMING)
        self.assertEqual(mapreduce, main_type)
        self.assertEqual(edp.JOB_SUBTYPE_STREAMING, sub_type)
Exemple #4
0
def check_mains_libs(data, **kwargs):
    """Validate the "mains" and "libs" entries of a job description.

    Pig, Hive, Shell, Spark and Storm jobs need a non-empty "mains" and
    must not list the same entry in both "mains" and "libs". Every other
    job type must leave "mains" empty and, unless it is a MapReduce
    streaming job, must supply "libs". Both lists are finally handed to
    ``_check_binaries``.

    :param data: mapping queried with ``get`` for "type", "mains", "libs"
    :param kwargs: accepted but not used by this function
    :raises e.InvalidDataException: if any of the rules above is violated
    """
    type_name = data.get("type")
    main_binaries = data.get("mains", [])
    lib_binaries = data.get("libs", [])

    base_type, sub_type = edp.split_job_type(type_name)
    is_streaming = (base_type == edp.JOB_TYPE_MAPREDUCE and
                    sub_type == edp.JOB_SUBTYPE_STREAMING)

    script_based = (edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE, edp.JOB_TYPE_SHELL)
    jar_based = (edp.JOB_TYPE_SPARK, edp.JOB_TYPE_STORM)

    if base_type in script_based + jar_based:
        # These types must have a value in mains and may also use libs.
        if not main_binaries:
            if base_type in jar_based:
                template = _("%s job requires main application jar")
            else:
                template = _("%s flow requires main script")
            raise e.InvalidDataException(template % type_name)

        # The same binary may not appear in both lists.
        if set(main_binaries) & set(lib_binaries):
            raise e.InvalidDataException(_("'mains' and 'libs' overlap"))
    else:
        # Java and MapReduce require libs, but MapReduce.Streaming does not.
        if not (is_streaming or lib_binaries):
            raise e.InvalidDataException(
                _("%s flow requires libs") % type_name)
        if main_binaries:
            raise e.InvalidDataException(
                _("%s flow does not use mains") % type_name)

    # Make sure that all referenced binaries exist.
    for binaries in (main_binaries, lib_binaries):
        _check_binaries(binaries)
Exemple #5
0
def check_mains_libs(data, **kwargs):
    """Check that "mains" and "libs" suit the job type found in ``data``.

    Pig, Hive and Spark jobs must provide "mains", and "mains" may not
    share entries with "libs". All remaining types must not provide
    "mains" and need "libs" unless the job is MapReduce streaming. Both
    lists are then passed to ``_check_binaries``.

    :param data: mapping queried with ``get`` for "type", "mains", "libs"
    :param kwargs: accepted but not used by this function
    :raises e.InvalidDataException: on the first rule that is violated
    """
    declared_type = data.get("type")
    main_entries = data.get("mains", [])
    lib_entries = data.get("libs", [])
    kind, subkind = edp.split_job_type(declared_type)
    streaming = (kind == edp.JOB_TYPE_MAPREDUCE
                 and subkind == edp.JOB_SUBTYPE_STREAMING)

    needs_main = kind in (edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE,
                          edp.JOB_TYPE_SPARK)

    if not needs_main:
        # Java and MapReduce require libs, but MapReduce.Streaming does not.
        if not streaming and not lib_entries:
            raise e.InvalidDataException(
                _("%s flow requires libs") % declared_type)
        if main_entries:
            raise e.InvalidDataException(
                _("%s flow does not use mains") % declared_type)
    elif not main_entries:
        # The wording differs for Spark, which is described as a "job"
        # needing a jar rather than a "flow" needing a script.
        msg = (_("%s job requires main application jar") % declared_type
               if kind == edp.JOB_TYPE_SPARK
               else _("%s flow requires main script") % declared_type)
        raise e.InvalidDataException(msg)
    elif set(main_entries).intersection(set(lib_entries)):
        # An entry listed in both places is rejected.
        raise e.InvalidDataException(_("'mains' and 'libs' overlap"))

    # Make sure that all referenced binaries exist.
    _check_binaries(main_entries)
    _check_binaries(lib_entries)
Exemple #6
0
def check_job_executor(data, job_id):
    """Validate the execution request ``data`` for the job ``job_id``.

    The job is fetched through ``api.get_job`` and its type is split into
    a type and subtype. Java jobs must pass ``_is_main_class_present``.
    Every other job needs both 'input_id' and 'output_id' in ``data``,
    and those go through the data-source checks. A MapReduce job with the
    Streaming subtype must additionally pass ``_streaming_present``.

    :param data: mapping holding 'cluster_id' and, for non-Java jobs,
        'input_id' and 'output_id'
    :param job_id: identifier passed to ``api.get_job``
    :raises ex.InvalidDataException: if a required field is missing
    """
    job = api.get_job(job_id)
    full_type = job.type
    base_type, sub_type = edp.split_job_type(full_type)

    cluster_id = data['cluster_id']

    # Check if cluster contains Oozie service to run job
    main_base.check_edp_job_support(cluster_id)

    if base_type == 'Java':
        # Java is the only type that needs no input and output objects.
        if not _is_main_class_present(data):
            raise ex.InvalidDataException(
                'Java job must specify edp.java.main_class')
    else:
        if 'input_id' not in data or 'output_id' not in data:
            raise ex.InvalidDataException(
                "%s job requires 'input_id' and 'output_id'" % full_type)

        source_id, sink_id = data['input_id'], data['output_id']
        b.check_data_source_exists(source_id)
        b.check_data_source_exists(sink_id)
        b.check_data_sources_are_different(source_id, sink_id)

        # _streaming_present is consulted only for MapReduce.Streaming.
        is_streaming = base_type == 'MapReduce' and sub_type == 'Streaming'
        if is_streaming and not _streaming_present(data):
            raise ex.InvalidDataException(
                "%s job must specify streaming mapper and reducer"
                % full_type)

    main_base.check_cluster_exists(cluster_id)
Exemple #7
0
def check_mains_libs(data, **kwargs):
    """Validate "mains" and "libs" against the job type in ``data``.

    Pig and Hive flows need a main script and may not repeat an entry
    from "mains" in "libs". Other flows must not define "mains" and must
    define "libs" unless the job is MapReduce with the Streaming subtype.
    Both lists are then passed to ``_check_binaries``.

    :param data: mapping queried with ``get`` for "type", "mains", "libs"
    :param kwargs: accepted but not used by this function
    :raises e.InvalidDataException: on the first rule that is violated
    """
    flow_type = data.get("type")
    main_items = data.get("mains", [])
    lib_items = data.get("libs", [])
    base, sub = edp.split_job_type(flow_type)

    if base in ('Pig', 'Hive'):
        # Pig or Hive flow has to contain script in mains, may also use libs
        if not main_items:
            raise e.InvalidDataException(
                "%s flow requires main script" % flow_type)

        shared = set(main_items).intersection(set(lib_items))
        if shared:
            raise e.InvalidDataException("'mains' and 'libs' overlap")
    else:
        # Only MapReduce streaming jobs are allowed to omit libs.
        libs_optional = base == "MapReduce" and sub == "Streaming"
        if not (libs_optional or lib_items):
            raise e.InvalidDataException(
                "%s flow requires libs" % flow_type)
        if main_items:
            raise e.InvalidDataException(
                "%s flow does not use mains" % flow_type)

    # Make sure that all referenced binaries exist
    for group in (main_items, lib_items):
        _check_binaries(group)
Exemple #8
0
def check_mains_libs(data, **kwargs):
    """Reject job descriptions whose "mains"/"libs" do not fit the type.

    The first violated rule determines the error message:

    * Pig/Hive without "mains", or with entries shared between "mains"
      and "libs";
    * any other type without "libs" (MapReduce streaming excepted), or
      with "mains" present.

    When no rule is violated, both lists go to ``_check_binaries``.

    :param data: mapping queried with ``get`` for "type", "mains", "libs"
    :param kwargs: accepted but not used by this function
    :raises e.InvalidDataException: carrying the message of the violated
        rule
    """
    requested_type = data.get("type")
    mains = data.get("mains", [])
    libs = data.get("libs", [])
    job_type, subtype = edp.split_job_type(requested_type)
    streaming = (job_type == edp.JOB_TYPE_MAPREDUCE and
                 subtype == edp.JOB_SUBTYPE_STREAMING)

    # Collect the first problem found (if any) and raise from one place.
    problem = None
    if job_type in [edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE]:
        # Pig or Hive flow has to contain script in mains, may also use libs
        if not mains:
            problem = "%s flow requires main script" % requested_type
        elif set(mains).intersection(set(libs)):
            problem = "'mains' and 'libs' overlap"
    elif not streaming and not libs:
        problem = "%s flow requires libs" % requested_type
    elif mains:
        problem = "%s flow does not use mains" % requested_type

    if problem is not None:
        raise e.InvalidDataException(problem)

    # Make sure that all referenced binaries exist
    _check_binaries(mains)
    _check_binaries(libs)
Exemple #9
0
    def validate_job_execution(self, cluster, job, data):
        """Run the execution checks that match the type of ``job``.

        Java jobs only get the main-class check. All other jobs get the
        data-source check, and MapReduce streaming jobs also get the
        streaming check. ``cluster`` is not used in this method.
        """
        if job.type in (edp.JOB_TYPE_JAVA,):
            # Java requires a main class instead of input/output objects.
            j.check_main_class_present(data, job)
            return

        j.check_data_sources(data, job)

        base_type, sub_type = edp.split_job_type(job.type)
        is_streaming = (base_type == edp.JOB_TYPE_MAPREDUCE and
                        sub_type == edp.JOB_SUBTYPE_STREAMING)
        if is_streaming:
            j.check_streaming_present(data, job)
Exemple #10
0
    def validate_job_execution(self, cluster, job, data):
        """Validate ``data`` for running ``job``, depending on its type.

        Non-Java jobs go through ``j.check_data_sources`` and, when the
        split type is MapReduce with the streaming subtype, through
        ``j.check_streaming_present``. Java jobs go through
        ``j.check_main_class_present`` only. ``cluster`` is not used in
        this method.
        """
        is_java = job.type in [edp.JOB_TYPE_JAVA]
        if not is_java:
            # All types except Java require input and output objects.
            j.check_data_sources(data, job)

            main_type, sub_type = edp.split_job_type(job.type)
            if main_type == edp.JOB_TYPE_MAPREDUCE:
                if sub_type == edp.JOB_SUBTYPE_STREAMING:
                    j.check_streaming_present(data, job)
        else:
            # Java requires a main class.
            j.check_main_class_present(data, job)
Exemple #11
0
    def validate_job_execution(self, cluster, job, data):
        """Apply the type-specific execution checks for ``job``.

        Shell jobs are accepted without further checks. Java jobs get the
        main-class check. Every other job gets the data-source check,
        plus the streaming check for MapReduce streaming jobs.
        ``cluster`` is not used in this method.
        """
        kind = job.type

        # Shell job type requires no specific fields
        if kind == edp.JOB_TYPE_SHELL:
            return

        # Java requires a main class rather than input and output objects
        if kind == edp.JOB_TYPE_JAVA:
            j.check_main_class_present(data, job)
            return

        j.check_data_sources(data, job)

        base_type, sub_type = edp.split_job_type(kind)
        streaming = (base_type == edp.JOB_TYPE_MAPREDUCE and
                     sub_type == edp.JOB_SUBTYPE_STREAMING)
        if streaming:
            j.check_streaming_present(data, job)
Exemple #12
0
    def validate_job_execution(self, cluster, job, data):
        """Check that ``data`` has what ``job`` needs in order to run.

        The checks are chosen by ``job.type``: none for Shell, the
        main-class check for Java, and the data-source check for the
        rest (followed by the streaming check when the type splits into
        MapReduce / streaming). ``cluster`` is not used in this method.
        """
        if job.type == edp.JOB_TYPE_SHELL:
            # Shell job type requires no specific fields
            return
        elif job.type == edp.JOB_TYPE_JAVA:
            # Java requires a main class
            j.check_main_class_present(data, job)
        else:
            # All remaining types require input and output objects
            j.check_data_sources(data, job)

            main_type, sub_type = edp.split_job_type(job.type)
            if main_type == edp.JOB_TYPE_MAPREDUCE:
                if sub_type == edp.JOB_SUBTYPE_STREAMING:
                    j.check_streaming_present(data, job)