def _create_job_binary(id, type):
    binary = mock.Mock()
    binary.id = id
    binary.url = "internal-db://42"
    if edp.compare_job_type(type, edp.JOB_TYPE_PIG):
        binary.name = "script.pig"
    elif edp.compare_job_type(type, edp.JOB_TYPE_MAPREDUCE,
                              edp.JOB_TYPE_JAVA):
        binary.name = "main.jar"
    else:
        binary.name = "script.q"
    return binary
def _create_job_binary(id, type):
    binary = mock.Mock()
    binary.id = id
    binary.url = "internal-db://42"
    if edp.compare_job_type(type, 'Pig'):
        binary.name = "script.pig"
    elif edp.compare_job_type(type, 'MapReduce', 'Java'):
        binary.name = "main.jar"
    else:
        binary.name = "script.q"
    return binary
def create_job_binary(id, type):
    binary = mock.Mock()
    binary.id = id
    binary.url = "internal-db://42"
    if edp.compare_job_type(type, edp.JOB_TYPE_PIG):
        binary.name = "script.pig"
    elif edp.compare_job_type(type, edp.JOB_TYPE_MAPREDUCE,
                              edp.JOB_TYPE_JAVA):
        binary.name = "main.jar"
    else:
        binary.name = "script.q"
    return binary
def test_compare_job_type(self):
    self.assertTrue(edp.compare_job_type("Java", "Java", "MapReduce",
                                         strict=True))
    self.assertFalse(edp.compare_job_type(MAPRED_STREAMING, "Java",
                                          "MapReduce", strict=True))
    self.assertTrue(edp.compare_job_type(MAPRED_STREAMING, "Java",
                                         "MapReduce"))
    self.assertFalse(edp.compare_job_type("MapReduce", "Java",
                                          MAPRED_STREAMING))
def get_possible_job_config(job_type):
    if edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        return {'job_config': ch_helper.get_possible_hive_config_from(
                'plugins/vanilla/v2_6_0/resources/hive-default.xml')}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE,
                            edp.JOB_TYPE_MAPREDUCE_STREAMING):
        return {'job_config': ch_helper.get_possible_mapreduce_config_from(
                'plugins/vanilla/v2_6_0/resources/mapred-default.xml')}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG):
        return {'job_config': ch_helper.get_possible_pig_config_from(
                'plugins/vanilla/v2_6_0/resources/mapred-default.xml')}
    return edp_engine.EdpOozieEngine.get_possible_job_config(job_type)
def get_possible_job_config(job_type):
    if edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        return {'job_config': ch_helper.get_possible_hive_config_from(
                'plugins/cdh/v5_4_0/resources/hive-site.xml')}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE,
                            edp.JOB_TYPE_MAPREDUCE_STREAMING):
        return {'job_config': ch_helper.get_possible_mapreduce_config_from(
                'plugins/cdh/v5_4_0/resources/mapred-site.xml')}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG):
        return {'job_config': ch_helper.get_possible_pig_config_from(
                'plugins/cdh/v5_4_0/resources/mapred-site.xml')}
    return edp_engine.OozieJobEngine.get_possible_job_config(job_type)
def get_possible_job_config(job_type):
    if not edp.compare_job_type(job_type, *edp.JOB_TYPES_ALL):
        return None

    if edp.compare_job_type(job_type, edp.JOB_TYPE_JAVA):
        return {'job_config': {'configs': [], 'args': []}}

    if edp.compare_job_type(job_type, edp.JOB_TYPE_SHELL):
        return {'job_config': {'configs': [], 'params': {}, 'args': []}}

    if edp.compare_job_type(job_type,
                            edp.JOB_TYPE_MAPREDUCE, edp.JOB_TYPE_PIG):
        # TODO(nmakhotkin): Here we need to return config based on the
        # specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/mapred-default.xml')
        if edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE):
            cfg += get_possible_mapreduce_configs()
    elif edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        # TODO(nmakhotkin): Here we need to return config based on the
        # specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/hive-default.xml')

    config = {'configs': cfg}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE):
        config.update({'params': {}})
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG):
        config.update({'args': []})
    return {'job_config': config}
def get_possible_job_config(job_type):
    if not edp.compare_job_type(job_type, *edp.JOB_TYPES_ALL):
        return None

    if edp.compare_job_type(job_type, edp.JOB_TYPE_JAVA):
        return {'job_config': {'configs': [], 'args': []}}

    if edp.compare_job_type(job_type, edp.JOB_TYPE_SHELL):
        return {'job_config': {'configs': [], 'params': {}, 'args': []}}

    if edp.compare_job_type(job_type,
                            edp.JOB_TYPE_MAPREDUCE, edp.JOB_TYPE_PIG):
        cfg = xmlutils.load_hadoop_xml_defaults(
            'service/edp/resources/mapred-default.xml')
        if edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE):
            cfg += get_possible_mapreduce_configs()
    elif edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        cfg = xmlutils.load_hadoop_xml_defaults(
            'service/edp/resources/hive-default.xml')

    config = {'configs': cfg}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE):
        config.update({'params': {}})
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG):
        config.update({'args': []})
    return {'job_config': config}
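For reference, a hedged summary of the result shapes implied by the branches above; the 'configs' contents are abbreviated here since they come from the XML defaults files loaded at runtime:

# Java:      {'job_config': {'configs': [], 'args': []}}
# Shell:     {'job_config': {'configs': [], 'params': {}, 'args': []}}
# MapReduce: {'job_config': {'configs': <mapred defaults + MR extras>}}
# Pig:       {'job_config': {'configs': <mapred defaults>,
#                            'params': {}, 'args': []}}
# Hive:      {'job_config': {'configs': <hive defaults>, 'params': {}}}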
def get_possible_job_config(job_type):
    if edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        return {'job_config': ch_helper.get_possible_hive_config_from(
                'plugins/hdp/versions/version_1_3_2/resources/'
                'ambari-config-resource.json')}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE,
                            edp.JOB_TYPE_MAPREDUCE_STREAMING):
        return {'job_config': ch_helper.get_possible_mapreduce_config_from(
                'plugins/hdp/versions/version_1_3_2/resources/'
                'ambari-config-resource.json')}
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG):
        return {'job_config': ch_helper.get_possible_pig_config_from(
                'plugins/hdp/versions/version_1_3_2/resources/'
                'ambari-config-resource.json')}
    return edp_engine.EdpOozieEngine.get_possible_job_config(job_type)
def get_data_sources(job_execution, job):
    if edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
        return None, None

    ctx = context.ctx()
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    return input_source, output_source
def _create_job_exec(job_id, type, configs=None):
    j_exec = mock.Mock()
    j_exec.job_id = job_id
    j_exec.job_configs = configs
    if edp.compare_job_type(type, edp.JOB_TYPE_JAVA):
        j_exec.job_configs['configs']['edp.java.main_class'] = \
            _java_main_class
        j_exec.job_configs['configs']['edp.java.java_opts'] = _java_opts
    return j_exec
def test_compare_job_type(self):
    self.assertTrue(
        edp.compare_job_type(edp.JOB_TYPE_JAVA, edp.JOB_TYPE_JAVA,
                             edp.JOB_TYPE_MAPREDUCE, strict=True))
    self.assertFalse(
        edp.compare_job_type(edp.JOB_TYPE_MAPREDUCE_STREAMING,
                             edp.JOB_TYPE_JAVA, edp.JOB_TYPE_MAPREDUCE,
                             strict=True))
    self.assertTrue(
        edp.compare_job_type(edp.JOB_TYPE_MAPREDUCE_STREAMING,
                             edp.JOB_TYPE_JAVA, edp.JOB_TYPE_MAPREDUCE))
    self.assertFalse(
        edp.compare_job_type(edp.JOB_TYPE_MAPREDUCE, edp.JOB_TYPE_JAVA,
                             edp.JOB_TYPE_MAPREDUCE_STREAMING))
def get_data_sources(job_execution, job):
    if edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA,
                            edp.JOB_TYPE_SPARK):
        return None, None

    ctx = context.ctx()
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    return input_source, output_source
def get_possible_job_config(job_type):
    if not edp.compare_job_type(job_type, *edp.JOB_TYPES_ALL):
        return None

    if edp.compare_job_type(job_type, edp.JOB_TYPE_JAVA):
        return {'job_config': {'configs': [], 'args': []}}

    if edp.compare_job_type(job_type, edp.JOB_TYPE_SHELL):
        return {'job_config': {'configs': [], 'params': {}, 'args': []}}

    if edp.compare_job_type(job_type,
                            edp.JOB_TYPE_MAPREDUCE, edp.JOB_TYPE_PIG):
        # TODO(nmakhotkin): Here we need to return config based on the
        # specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/mapred-default.xml')
        if edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE):
            cfg += xmlutils.load_hadoop_xml_defaults(
                'service/edp/resources/mapred-job-config.xml')
    elif edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        # TODO(nmakhotkin): Here we need to return config based on the
        # specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/hive-default.xml')

    # TODO(tmckay): args should be a list when bug #269968
    # is fixed on the UI side
    config = {'configs': cfg, "args": {}}
    if not edp.compare_job_type(job_type, edp.JOB_TYPE_MAPREDUCE,
                                edp.JOB_TYPE_JAVA):
        config.update({'params': {}})
    return {'job_config': config}
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
        input_source = conductor.data_source_get(ctx,
                                                 job_execution.input_id)
        output_source = conductor.data_source_get(ctx,
                                                  job_execution.output_id)
    else:
        input_source = None
        output_source = None

    # TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server, wf_dir,
                                            wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {'oozie_job_id': oozie_job_id,
                             'start_time': datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)

    return job_execution
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, 'Java'):
        input_source = conductor.data_source_get(ctx,
                                                 job_execution.input_id)
        output_source = conductor.data_source_get(ctx,
                                                  job_execution.output_id)
    else:
        input_source = None
        output_source = None

    # TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server, wf_dir,
                                            wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {'oozie_job_id': oozie_job_id,
                             'start_time': datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)

    return job_execution
def test_compare_job_type(self):
    self.assertTrue(edp.compare_job_type(
        edp.JOB_TYPE_JAVA,
        edp.JOB_TYPE_JAVA, edp.JOB_TYPE_MAPREDUCE,
        strict=True))
    self.assertFalse(edp.compare_job_type(
        edp.JOB_TYPE_MAPREDUCE_STREAMING,
        edp.JOB_TYPE_JAVA, edp.JOB_TYPE_MAPREDUCE,
        strict=True))
    self.assertTrue(edp.compare_job_type(
        edp.JOB_TYPE_MAPREDUCE_STREAMING,
        edp.JOB_TYPE_JAVA, edp.JOB_TYPE_MAPREDUCE))
    self.assertFalse(edp.compare_job_type(
        edp.JOB_TYPE_MAPREDUCE,
        edp.JOB_TYPE_JAVA, edp.JOB_TYPE_MAPREDUCE_STREAMING))
def get_possible_job_config(job_type):
    if edp.compare_job_type(job_type, edp.JOB_TYPE_HIVE):
        return {
            'job_config': ch_helper.get_possible_hive_config_from(
                'plugins/vanilla/v2_6_0/resources/hive-default.xml')
        }
    if edp.compare_job_type(job_type,
                            edp.JOB_TYPE_MAPREDUCE,
                            edp.JOB_TYPE_MAPREDUCE_STREAMING):
        return {
            'job_config': ch_helper.get_possible_mapreduce_config_from(
                'plugins/vanilla/v2_6_0/resources/mapred-default.xml')
        }
    if edp.compare_job_type(job_type, edp.JOB_TYPE_PIG):
        return {
            'job_config': ch_helper.get_possible_pig_config_from(
                'plugins/vanilla/v2_6_0/resources/mapred-default.xml')
        }
    return edp_engine.EdpOozieEngine.get_possible_job_config(job_type)
def _create_job(id, job_binary, type):
    job = mock.Mock()
    job.id = id
    job.type = type
    job.name = 'special_name'
    if edp.compare_job_type(type, edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE):
        job.mains = [job_binary]
        job.libs = None
    else:
        job.libs = [job_binary]
        job.mains = None
    return job
def _create_job(id, job_binary, type):
    job = mock.Mock()
    job.id = id
    job.type = type
    job.name = 'special_name'
    if edp.compare_job_type(type, 'Pig', 'Hive'):
        job.mains = [job_binary]
        job.libs = None
    else:
        job.libs = [job_binary]
        job.mains = None
    return job
def _create_job_exec(job_id, type, configs=None, info=None):
    j_exec = mock.Mock()
    j_exec.id = six.text_type(uuid.uuid4())
    j_exec.job_id = job_id
    j_exec.job_configs = configs
    j_exec.info = info
    if not j_exec.job_configs:
        j_exec.job_configs = {}
    if edp.compare_job_type(type, edp.JOB_TYPE_JAVA):
        j_exec.job_configs['configs']['edp.java.main_class'] = \
            _java_main_class
        j_exec.job_configs['configs']['edp.java.java_opts'] = _java_opts
    return j_exec
def get_possible_job_config(job_type):
    if not edp.compare_job_type(job_type, *get_possible_job_types()):
        return None

    if edp.compare_job_type(job_type, 'Java'):
        return {'job_config': {'configs': [], 'args': []}}

    if edp.compare_job_type(job_type, 'MapReduce', 'Pig'):
        # TODO(nmakhotkin) Here we should return config based on
        # specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/mapred-default.xml')
        if edp.compare_job_type(job_type, 'MapReduce'):
            cfg += xmlutils.load_hadoop_xml_defaults(
                'service/edp/resources/mapred-job-config.xml')
    elif edp.compare_job_type(job_type, 'Hive'):
        # TODO(nmakhotkin) Here we should return config based on
        # specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/hive-default.xml')

    # TODO(tmckay): args should be a list when bug #269968
    # is fixed on the UI side
    config = {'configs': cfg, "args": {}}
    if not edp.compare_job_type(job_type, 'MapReduce', 'Java'):
        config.update({'params': {}})
    return {'job_config': config}
def _create_job_exec(job_id, type, configs=None, info=None):
    j_exec = mock.Mock()
    j_exec.id = uuidutils.generate_uuid()
    j_exec.job_id = job_id
    j_exec.job_configs = configs
    j_exec.info = info
    j_exec.input_id = 4
    j_exec.output_id = 5
    j_exec.engine_job_id = None
    j_exec.data_source_urls = {}
    if not j_exec.job_configs:
        j_exec.job_configs = {}
    if edp.compare_job_type(type, edp.JOB_TYPE_JAVA):
        j_exec.job_configs['configs']['edp.java.main_class'] = \
            _java_main_class
        j_exec.job_configs['configs']['edp.java.java_opts'] = _java_opts
    return j_exec
def get_data_sources(job_execution, job, data_source_urls):
    if edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA,
                            edp.JOB_TYPE_SPARK):
        return None, None

    ctx = context.ctx()

    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    if input_source and input_source.id not in data_source_urls:
        data_source_urls[input_source.id] = _construct_data_source_url(
            input_source.url, job_execution.id)

    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    if output_source and output_source.id not in data_source_urls:
        data_source_urls[output_source.id] = _construct_data_source_url(
            output_source.url, job_execution.id)

    return input_source, output_source
def edp_testing(self, job_type, job_data_list, lib_data_list=None,
                configs=None, pass_input_output_args=False,
                swift_binaries=False, hdfs_local_output=False):
    try:
        swift = self.connect_to_swift()
        container_name = 'Edp-test-%s' % str(uuid.uuid4())[:8]
        swift.put_container(container_name)
        swift.put_object(
            container_name, 'input', ''.join(
                random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
                for x in range(10000)))

    except Exception as e:
        with excutils.save_and_reraise_exception():
            self.delete_swift_container(swift, container_name)
            print(str(e))

    input_id = None
    output_id = None
    job_id = None
    job_execution = None
    try:
        job_binary_list = []
        lib_binary_list = []
        job_binary_internal_list = []
        swift_input_url = 'swift://%s.sahara/input' % container_name
        if hdfs_local_output:
            # This will create a file in hdfs under the user
            # executing the job (i.e. /usr/hadoop/Edp-test-xxxx-out)
            output_type = "hdfs"
            output_url = container_name + "-out"
        else:
            output_type = "swift"
            output_url = 'swift://%s.sahara/output' % container_name

        # Java jobs don't use data sources. Input/output paths must
        # be passed as args with corresponding username/password configs
        if not edp.compare_job_type(job_type, edp.JOB_TYPE_JAVA):
            input_id = self._create_data_source(
                'input-%s' % str(uuid.uuid4())[:8], 'swift',
                swift_input_url)
            output_id = self._create_data_source(
                'output-%s' % str(uuid.uuid4())[:8], output_type,
                output_url)

        if job_data_list:
            if swift_binaries:
                self._create_job_binaries(job_data_list,
                                          job_binary_internal_list,
                                          job_binary_list,
                                          swift_connection=swift,
                                          container_name=container_name)
            else:
                self._create_job_binaries(job_data_list,
                                          job_binary_internal_list,
                                          job_binary_list)

        if lib_data_list:
            if swift_binaries:
                self._create_job_binaries(lib_data_list,
                                          job_binary_internal_list,
                                          lib_binary_list,
                                          swift_connection=swift,
                                          container_name=container_name)
            else:
                self._create_job_binaries(lib_data_list,
                                          job_binary_internal_list,
                                          lib_binary_list)

        job_id = self._create_job(
            'Edp-test-job-%s' % str(uuid.uuid4())[:8], job_type,
            job_binary_list, lib_binary_list)
        if not configs:
            configs = {}

        # Append the input/output paths with the swift configs
        # if the caller has requested it...
        if edp.compare_job_type(
                job_type, edp.JOB_TYPE_JAVA) and pass_input_output_args:
            self._add_swift_configs(configs)
            if "args" in configs:
                configs["args"].extend([swift_input_url, output_url])
            else:
                configs["args"] = [swift_input_url, output_url]

        job_execution = self.sahara.job_executions.create(
            job_id, self.cluster_id, input_id, output_id,
            configs=configs)

        if job_execution:
            self._await_job_execution(job_execution)

    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))

    finally:
        self.delete_swift_container(swift, container_name)
        self._delete_job(job_execution, job_id,
                         job_binary_list + lib_binary_list,
                         job_binary_internal_list, input_id, output_id)
def edp_testing(self, job_type, job_data_list, lib_data_list=None,
                configs=None, pass_input_output_args=False,
                swift_binaries=False, hdfs_local_output=False):
    job_data_list = job_data_list or []
    lib_data_list = lib_data_list or []
    configs = configs or {}

    swift = self.connect_to_swift()
    container_name = 'Edp-test-%s' % str(uuid.uuid4())[:8]
    swift.put_container(container_name)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.delete_swift_container, swift,
                        container_name)
    swift.put_object(
        container_name, 'input', ''.join(
            random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
            for x in six.moves.range(10000)))

    input_id = None
    output_id = None
    job_id = None
    job_execution = None
    job_binary_list = []
    lib_binary_list = []
    job_binary_internal_list = []
    swift_input_url = 'swift://%s.sahara/input' % container_name
    if hdfs_local_output:
        # This will create a file in hdfs under the user
        # executing the job (i.e. /usr/hadoop/Edp-test-xxxx-out)
        output_type = "hdfs"
        output_url = container_name + "-out"
    else:
        output_type = "swift"
        output_url = 'swift://%s.sahara/output' % container_name

    # Java jobs don't use data sources. Input/output paths must
    # be passed as args with corresponding username/password configs
    if not edp.compare_job_type(job_type, edp.JOB_TYPE_JAVA,
                                edp.JOB_TYPE_SPARK):
        input_id = self._create_data_source(
            'input-%s' % str(uuid.uuid4())[:8], 'swift',
            swift_input_url)
        output_id = self._create_data_source(
            'output-%s' % str(uuid.uuid4())[:8], output_type,
            output_url)

    if job_data_list:
        if swift_binaries:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list,
                                      swift_connection=swift,
                                      container_name=container_name)
        else:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list)

    if lib_data_list:
        if swift_binaries:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list,
                                      swift_connection=swift,
                                      container_name=container_name)
        else:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list)

    job_id = self._create_job('Edp-test-job-%s' % str(uuid.uuid4())[:8],
                              job_type, job_binary_list, lib_binary_list)
    if not configs:
        configs = {}

    # TODO(tmckay): for spark we don't have support for swift
    # yet. When we do, we'll need something here to set up
    # swift paths and we can use a spark wordcount job

    # Append the input/output paths with the swift configs
    # if the caller has requested it...
    if edp.compare_job_type(job_type,
                            edp.JOB_TYPE_JAVA) and pass_input_output_args:
        self._add_swift_configs(configs)
        if "args" in configs:
            configs["args"].extend([swift_input_url, output_url])
        else:
            configs["args"] = [swift_input_url, output_url]

    job_execution = self.sahara.job_executions.create(
        job_id, self.cluster_id, input_id, output_id, configs=configs)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.sahara.job_executions.delete,
                        job_execution.id)

    return job_execution.id
def edp_testing(self, job_type, job_data_list, lib_data_list=None,
                configs=None, pass_input_output_args=False,
                swift_binaries=False, hdfs_local_output=False):
    try:
        swift = self.connect_to_swift()
        container_name = 'Edp-test-%s' % str(uuid.uuid4())[:8]
        swift.put_container(container_name)
        swift.put_object(
            container_name, 'input', ''.join(
                random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
                for x in range(10000)))

    except Exception as e:
        with excutils.save_and_reraise_exception():
            self.delete_swift_container(swift, container_name)
            print(str(e))

    input_id = None
    output_id = None
    job_id = None
    job_execution = None
    try:
        job_binary_list = []
        lib_binary_list = []
        job_binary_internal_list = []
        swift_input_url = 'swift://%s.sahara/input' % container_name
        if hdfs_local_output:
            # This will create a file in hdfs under the user
            # executing the job (i.e. /usr/hadoop/Edp-test-xxxx-out)
            output_type = "hdfs"
            output_url = container_name + "-out"
        else:
            output_type = "swift"
            output_url = 'swift://%s.sahara/output' % container_name

        # Java jobs don't use data sources. Input/output paths must
        # be passed as args with corresponding username/password configs
        if not edp.compare_job_type(job_type, edp.JOB_TYPE_JAVA):
            input_id = self._create_data_source(
                'input-%s' % str(uuid.uuid4())[:8], 'swift',
                swift_input_url)
            output_id = self._create_data_source(
                'output-%s' % str(uuid.uuid4())[:8], output_type,
                output_url)

        if job_data_list:
            if swift_binaries:
                self._create_job_binaries(job_data_list,
                                          job_binary_internal_list,
                                          job_binary_list,
                                          swift_connection=swift,
                                          container_name=container_name)
            else:
                self._create_job_binaries(job_data_list,
                                          job_binary_internal_list,
                                          job_binary_list)

        if lib_data_list:
            if swift_binaries:
                self._create_job_binaries(lib_data_list,
                                          job_binary_internal_list,
                                          lib_binary_list,
                                          swift_connection=swift,
                                          container_name=container_name)
            else:
                self._create_job_binaries(lib_data_list,
                                          job_binary_internal_list,
                                          lib_binary_list)

        job_id = self._create_job(
            'Edp-test-job-%s' % str(uuid.uuid4())[:8], job_type,
            job_binary_list, lib_binary_list)
        if not configs:
            configs = {}

        # Append the input/output paths with the swift configs
        # if the caller has requested it...
        if edp.compare_job_type(
                job_type, edp.JOB_TYPE_JAVA) and pass_input_output_args:
            self._add_swift_configs(configs)
            if "args" in configs:
                configs["args"].extend([swift_input_url, output_url])
            else:
                configs["args"] = [swift_input_url, output_url]

        job_execution = self.sahara.job_executions.create(
            job_id, self.cluster_id, input_id, output_id,
            configs=configs)

        if job_execution:
            self._await_job_execution(job_execution)

    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))

    finally:
        self.delete_swift_container(swift, container_name)
        self._delete_job(job_execution, job_id,
                         job_binary_list + lib_binary_list,
                         job_binary_internal_list, input_id, output_id)
def validate(input_data, output_data, job):
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_PIG,
                                edp.JOB_TYPE_MAPREDUCE,
                                edp.JOB_TYPE_HIVE,
                                edp.JOB_TYPE_JAVA):
        raise RuntimeError
def validate(input_data, output_data, job):
    if not edp.compare_job_type(job.type, 'Pig', 'MapReduce', 'Hive',
                                'Java'):
        raise RuntimeError
def compare_job_type(job_type, *args, **kwargs):
    return edp.compare_job_type(job_type, *args, **kwargs)
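A minimal sketch of the matching rule that edp.compare_job_type appears to follow, inferred from the test_compare_job_type cases above. The name compare_job_type_sketch and the dot-separated subtype convention (e.g. "MapReduce.Streaming" having base type "MapReduce") are assumptions for illustration, not the library's actual implementation.

def compare_job_type_sketch(job_type, *job_types, **kwargs):
    # An exact match against the candidate types always succeeds.
    if job_type in job_types:
        return True
    # With strict=True, only exact matches count (assumption based on
    # the tests: "MapReduce.Streaming" fails strict against "MapReduce").
    if kwargs.get('strict', False):
        return False
    # Otherwise fall back to the base type before the dot, so
    # "MapReduce.Streaming" matches "MapReduce" but not vice versa.
    return job_type.split('.')[0] in job_types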
def edp_testing(self, job_type, job_data_list, lib_data_list=None,
                configs=None, pass_input_output_args=False,
                swift_binaries=False, hdfs_local_output=False):
    job_data_list = job_data_list or []
    lib_data_list = lib_data_list or []
    configs = configs or {}

    test_id = 'edp-mapr-test-%s' % str(uuid.uuid4())[:8]
    swift = self.connect_to_swift()
    container = test_id
    swift.put_container(container)

    input_folder = '/%s' % test_id
    cldb_ip = self.cluster_info['node_info']['namenode_ip']
    self.create_mapr_fs_dir(cldb_ip, input_folder)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.delete_swift_container, swift, container)

    input_data = ''.join(
        random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
        for x in six.moves.range(10000))
    input_file = '%s/input' % input_folder
    self.put_file_to_mapr_fs(cldb_ip, input_file, input_data)

    input_id = None
    output_id = None
    job_binary_list = []
    lib_binary_list = []
    job_binary_internal_list = []
    maprfs_input_url = 'maprfs://%s' % input_file
    maprfs_output_url = 'maprfs://%s/output' % (input_folder + '-out')

    if not utils_edp.compare_job_type(job_type,
                                      utils_edp.JOB_TYPE_JAVA,
                                      utils_edp.JOB_TYPE_SPARK):
        input_id = self._create_data_source(
            'input-%s' % str(uuid.uuid4())[:8], 'maprfs',
            maprfs_input_url)
        output_id = self._create_data_source(
            'output-%s' % str(uuid.uuid4())[:8], 'maprfs',
            maprfs_output_url)

    if job_data_list:
        if swift_binaries:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list,
                                      swift_connection=swift,
                                      container_name=container)
        else:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list)

    if lib_data_list:
        if swift_binaries:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list,
                                      swift_connection=swift,
                                      container_name=container)
        else:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list)

    job_id = self._create_job('edp-test-job-%s' % str(uuid.uuid4())[:8],
                              job_type, job_binary_list, lib_binary_list)
    if not configs:
        configs = {}

    if utils_edp.compare_job_type(
            job_type, utils_edp.JOB_TYPE_JAVA) and pass_input_output_args:
        self._enable_substitution(configs)
        if "args" in configs:
            configs["args"].extend([maprfs_input_url, maprfs_output_url])
        else:
            configs["args"] = [maprfs_input_url, maprfs_output_url]

    job_execution = self.sahara.job_executions.create(
        job_id, self.cluster_id, input_id, output_id, configs=configs)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.sahara.job_executions.delete,
                        job_execution.id)

    return job_execution.id
def edp_testing(self, job_type, job_data_list, lib_data_list=None,
                configs=None, pass_input_output_args=False,
                swift_binaries=False, hdfs_local_output=False):
    job_data_list = job_data_list or []
    lib_data_list = lib_data_list or []
    configs = configs or {}

    test_id = 'edp-mapr-test-%s' % str(uuid.uuid4())[:8]
    swift = self.connect_to_swift()
    container = test_id
    swift.put_container(container)

    input_folder = '/%s' % test_id
    cldb_ip = self.cluster_info['node_info']['namenode_ip']
    self.create_mapr_fs_dir(cldb_ip, input_folder)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.delete_swift_container, swift, container)

    input_data = ''.join(
        random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
        for x in six.moves.range(10000))
    input_file = '%s/input' % input_folder
    self.put_file_to_mapr_fs(cldb_ip, input_file, input_data)

    input_id = None
    output_id = None
    job_binary_list = []
    lib_binary_list = []
    job_binary_internal_list = []
    maprfs_input_url = 'maprfs://%s' % input_file
    maprfs_output_url = 'maprfs://%s/output' % (input_folder + '-out')

    if not utils_edp.compare_job_type(job_type,
                                      utils_edp.JOB_TYPE_JAVA,
                                      utils_edp.JOB_TYPE_SPARK):
        input_id = self._create_data_source(
            'input-%s' % str(uuid.uuid4())[:8], 'maprfs',
            maprfs_input_url)
        output_id = self._create_data_source(
            'output-%s' % str(uuid.uuid4())[:8], 'maprfs',
            maprfs_output_url)

    if job_data_list:
        if swift_binaries:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list,
                                      swift_connection=swift,
                                      container_name=container)
        else:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list)

    if lib_data_list:
        if swift_binaries:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list,
                                      swift_connection=swift,
                                      container_name=container)
        else:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list)

    job_id = self._create_job(
        'edp-test-job-%s' % str(uuid.uuid4())[:8], job_type,
        job_binary_list, lib_binary_list)
    if not configs:
        configs = {}

    if utils_edp.compare_job_type(
            job_type, utils_edp.JOB_TYPE_JAVA) and pass_input_output_args:
        self._enable_substitution(configs)
        if "args" in configs:
            configs["args"].extend([maprfs_input_url, maprfs_output_url])
        else:
            configs["args"] = [maprfs_input_url, maprfs_output_url]

    job_execution = self.sahara.job_executions.create(
        job_id, self.cluster_id, input_id, output_id, configs=configs)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.sahara.job_executions.delete,
                        job_execution.id)

    return job_execution.id
def edp_testing(self, job_type, job_data_list, lib_data_list=None,
                configs=None, pass_input_output_args=False,
                swift_binaries=False, hdfs_local_output=False):
    job_data_list = job_data_list or []
    lib_data_list = lib_data_list or []
    configs = configs or {}

    swift = self.connect_to_swift()
    container_name = 'Edp-test-%s' % str(uuid.uuid4())[:8]
    swift.put_container(container_name)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.delete_swift_container, swift,
                        container_name)
    swift.put_object(
        container_name, 'input', ''.join(
            random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
            for x in six.moves.range(10000)))

    input_id = None
    output_id = None
    job_id = None
    job_execution = None
    job_binary_list = []
    lib_binary_list = []
    job_binary_internal_list = []
    swift_input_url = 'swift://%s.sahara/input' % container_name
    if hdfs_local_output:
        # This will create a file in hdfs under the user
        # executing the job (i.e. /usr/hadoop/Edp-test-xxxx-out)
        output_type = "hdfs"
        output_url = container_name + "-out"
    else:
        output_type = "swift"
        output_url = 'swift://%s.sahara/output' % container_name

    input_name = 'input-%s' % str(uuid.uuid4())[:8]
    input_id = self._create_data_source(input_name, 'swift',
                                        swift_input_url)
    output_name = 'output-%s' % str(uuid.uuid4())[:8]
    output_id = self._create_data_source(output_name, output_type,
                                         output_url)

    if job_data_list:
        if swift_binaries:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list,
                                      swift_connection=swift,
                                      container_name=container_name)
        else:
            self._create_job_binaries(job_data_list,
                                      job_binary_internal_list,
                                      job_binary_list)

    if lib_data_list:
        if swift_binaries:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list,
                                      swift_connection=swift,
                                      container_name=container_name)
        else:
            self._create_job_binaries(lib_data_list,
                                      job_binary_internal_list,
                                      lib_binary_list)

    job_id = self._create_job(
        'Edp-test-job-%s' % str(uuid.uuid4())[:8], job_type,
        job_binary_list, lib_binary_list)
    if not configs:
        configs = {}

    # TODO(tmckay): for spark we don't have support for swift
    # yet. When we do, we'll need something here to set up
    # swift paths and we can use a spark wordcount job

    # Append the input/output paths with the swift configs
    # if the caller has requested it...
    if edp.compare_job_type(
            job_type, edp.JOB_TYPE_JAVA) and pass_input_output_args:
        self._enable_substitution(configs)
        input_arg = job_utils.DATA_SOURCE_PREFIX + input_name
        output_arg = output_id
        if "args" in configs:
            configs["args"].extend([input_arg, output_arg])
        else:
            configs["args"] = [input_arg, output_arg]

    job_execution = self.sahara.job_executions.create(
        job_id, self.cluster_id, input_id, output_id, configs=configs)
    if not self.common_config.RETAIN_EDP_AFTER_TEST:
        self.addCleanup(self.sahara.job_executions.delete,
                        job_execution.id)

    return job_execution.id
def run_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return

    if CONF.use_namespaces and not CONF.use_floating_ips:
        plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
        oozie = plugin.get_oozie_server(cluster)

        info = oozie.remote().get_neutron_info()
        extra = job_execution.extra.copy()
        extra['neutron'] = info

        job_execution = conductor.job_execution_update(
            ctx, job_execution_id, {'extra': extra})

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
        input_source = conductor.data_source_get(ctx,
                                                 job_execution.input_id)
        output_source = conductor.data_source_get(ctx,
                                                  job_execution.output_id)
    else:
        input_source = None
        output_source = None

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server, wf_dir,
                                            wf_xml, hdfs_user)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    rm_path = plugin.get_resource_manager_uri(cluster)
    nn_path = plugin.get_name_node_uri(cluster)

    client = _create_oozie_client(cluster)
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {'oozie_job_id': oozie_job_id,
                             'start_time': datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)