def test_split_job_type(self):
    """split_job_type returns (type, subtype) for plain and compound types."""
    cases = [
        ("MapReduce", "MapReduce", ""),
        (MAPRED_STREAMING, "MapReduce", "Streaming"),
    ]
    for raw, want_type, want_subtype in cases:
        jtype, stype = edp.split_job_type(raw)
        self.assertEqual(jtype, want_type)
        self.assertEqual(stype, want_subtype)
def test_split_job_type(self):
    """split_job_type breaks a compound job type into (type, subtype)."""
    cases = [
        (edp.JOB_TYPE_MAPREDUCE,
         edp.JOB_TYPE_MAPREDUCE, edp.JOB_SUBTYPE_NONE),
        (edp.JOB_TYPE_MAPREDUCE_STREAMING,
         edp.JOB_TYPE_MAPREDUCE, edp.JOB_SUBTYPE_STREAMING),
    ]
    for raw, want_type, want_subtype in cases:
        jtype, stype = edp.split_job_type(raw)
        self.assertEqual(want_type, jtype)
        self.assertEqual(want_subtype, stype)
def check_mains_libs(data, **kwargs):
    """Validate the 'mains' and 'libs' lists of a job definition.

    Raises e.InvalidDataException when a required main entry is missing,
    when 'mains' and 'libs' overlap, or when 'mains' is supplied for a
    job type that does not use it.
    """
    main_binaries = data.get("mains", [])
    lib_binaries = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    is_streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                    and subtype == edp.JOB_SUBTYPE_STREAMING)

    # These types must have a value in mains and may also use libs
    main_required = (edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE, edp.JOB_TYPE_SHELL,
                     edp.JOB_TYPE_SPARK, edp.JOB_TYPE_STORM)
    if job_type in main_required:
        if not main_binaries:
            if job_type in (edp.JOB_TYPE_SPARK, edp.JOB_TYPE_STORM):
                msg = _(
                    "%s job requires main application jar") % data.get("type")
            else:
                msg = _("%s flow requires main script") % data.get("type")
            raise e.InvalidDataException(msg)
        # Check for overlap
        if set(main_binaries) & set(lib_binaries):
            raise e.InvalidDataException(_("'mains' and 'libs' overlap"))
    else:
        # Java and MapReduce require libs, but MapReduce.Streaming does not
        if not (is_streaming or lib_binaries):
            raise e.InvalidDataException(
                _("%s flow requires libs") % data.get("type"))
        if main_binaries:
            raise e.InvalidDataException(
                _("%s flow does not use mains") % data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(main_binaries)
    _check_binaries(lib_binaries)
def check_mains_libs(data, **kwargs):
    """Validate the 'mains' and 'libs' lists of a job definition.

    Raises e.InvalidDataException when a required main entry is missing,
    when 'mains' and 'libs' overlap, or when 'mains' is supplied for a
    job type that does not use it.
    """
    main_binaries = data.get("mains", [])
    lib_binaries = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    is_streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                    and subtype == edp.JOB_SUBTYPE_STREAMING)

    # These types must have a value in mains and may also use libs
    if job_type in (edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE, edp.JOB_TYPE_SPARK):
        if not main_binaries:
            if job_type == edp.JOB_TYPE_SPARK:
                msg = _("%s job requires main application jar") % data.get(
                    "type")
            else:
                msg = _("%s flow requires main script") % data.get("type")
            raise e.InvalidDataException(msg)
        # Check for overlap
        if set(main_binaries) & set(lib_binaries):
            raise e.InvalidDataException(_("'mains' and 'libs' overlap"))
    else:
        # Java and MapReduce require libs, but MapReduce.Streaming does not
        if not (is_streaming or lib_binaries):
            raise e.InvalidDataException(
                _("%s flow requires libs") % data.get("type"))
        if main_binaries:
            raise e.InvalidDataException(
                _("%s flow does not use mains") % data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(main_binaries)
    _check_binaries(lib_binaries)
def check_job_executor(data, job_id):
    """Validate an execution request against the referenced job's type."""
    job = api.get_job(job_id)
    job_type, subtype = edp.split_job_type(job.type)

    # Check if cluster contains Oozie service to run job
    main_base.check_edp_job_support(data['cluster_id'])

    if job_type == 'Java':
        # Java jobs declare a main class rather than data sources
        if not _is_main_class_present(data):
            raise ex.InvalidDataException('Java job must '
                                          'specify edp.java.main_class')
    else:
        # All types except Java require input and output objects
        if 'input_id' not in data or 'output_id' not in data:
            raise ex.InvalidDataException("%s job requires 'input_id' "
                                          "and 'output_id'" % job.type)
        b.check_data_source_exists(data['input_id'])
        b.check_data_source_exists(data['output_id'])
        b.check_data_sources_are_different(data['input_id'],
                                           data['output_id'])

        if job_type == 'MapReduce' and subtype == 'Streaming':
            if not _streaming_present(data):
                raise ex.InvalidDataException("%s job "
                                              "must specify streaming mapper "
                                              "and reducer" % job.type)

    main_base.check_cluster_exists(data['cluster_id'])
def check_mains_libs(data, **kwargs):
    """Validate 'mains' and 'libs' for the job type given in data."""
    main_binaries = data.get("mains", [])
    lib_binaries = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    is_streaming = job_type == "MapReduce" and subtype == "Streaming"

    # Pig or Hive flow has to contain script in mains, may also use libs
    if job_type in ('Pig', 'Hive'):
        if not main_binaries:
            raise e.InvalidDataException("%s flow requires main script" %
                                         data.get("type"))
        # Check for overlap
        if set(main_binaries) & set(lib_binaries):
            raise e.InvalidDataException("'mains' and 'libs' overlap")
    else:
        if not (is_streaming or lib_binaries):
            raise e.InvalidDataException("%s flow requires libs" %
                                         data.get("type"))
        if main_binaries:
            raise e.InvalidDataException("%s flow does not use mains" %
                                         data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(main_binaries)
    _check_binaries(lib_binaries)
def check_mains_libs(data, **kwargs):
    """Validate 'mains' and 'libs' for the job type given in data."""
    main_binaries = data.get("mains", [])
    lib_binaries = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    is_streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                    and subtype == edp.JOB_SUBTYPE_STREAMING)

    # Pig or Hive flow has to contain script in mains, may also use libs
    if job_type in (edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE):
        if not main_binaries:
            raise e.InvalidDataException("%s flow requires main script" %
                                         data.get("type"))
        # Check for overlap
        if set(main_binaries) & set(lib_binaries):
            raise e.InvalidDataException("'mains' and 'libs' overlap")
    else:
        if not (is_streaming or lib_binaries):
            raise e.InvalidDataException("%s flow requires libs" %
                                         data.get("type"))
        if main_binaries:
            raise e.InvalidDataException("%s flow does not use mains" %
                                         data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(main_binaries)
    _check_binaries(lib_binaries)
def validate_job_execution(self, cluster, job, data):
    """Check that the execution data carries what this job type needs."""
    # All types except Java require input and output objects
    # and Java require main class
    if job.type == edp.JOB_TYPE_JAVA:
        j.check_main_class_present(data, job)
    else:
        j.check_data_sources(data, job)

    job_type, subtype = edp.split_job_type(job.type)
    streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                 and subtype == edp.JOB_SUBTYPE_STREAMING)
    if streaming:
        j.check_streaming_present(data, job)
def validate_job_execution(self, cluster, job, data):
    """Validate execution data for the given job on this engine."""
    # Shell job type requires no specific fields
    if job.type == edp.JOB_TYPE_SHELL:
        return

    # All other types except Java require input and output
    # objects and Java require main class
    if job.type == edp.JOB_TYPE_JAVA:
        j.check_main_class_present(data, job)
    else:
        j.check_data_sources(data, job)

    job_type, subtype = edp.split_job_type(job.type)
    streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                 and subtype == edp.JOB_SUBTYPE_STREAMING)
    if streaming:
        j.check_streaming_present(data, job)
def validate_job_execution(self, cluster, job, data):
    """Validate execution data for the given job on this engine."""
    # Shell job type requires no specific fields
    if job.type == edp.JOB_TYPE_SHELL:
        return

    # All other types except Java require input and output
    # objects and Java require main class
    if job.type == edp.JOB_TYPE_JAVA:
        j.check_main_class_present(data, job)
    else:
        j.check_data_sources(data, job)

    jtype, jsubtype = edp.split_job_type(job.type)
    if (jtype == edp.JOB_TYPE_MAPREDUCE
            and jsubtype == edp.JOB_SUBTYPE_STREAMING):
        j.check_streaming_present(data, job)