    def test_build_workflow_swift_configs(self, job_binary):
        # Test that swift configs come from either input or output
        # data sources
        job, job_exec = _create_all_stack(edp.JOB_TYPE_PIG)
        job_binary.return_value = {"name": "script.pig"}

        input_data = _create_data_source('swift://ex/i')
        output_data = _create_data_source('hdfs://user/hadoop/out')

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        input_data = _create_data_source('hdfs://user/hadoop/in')
        output_data = _create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        job, job_exec = _create_all_stack(
            edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})

        input_data = _create_data_source('hdfs://user/hadoop/in')
        output_data = _create_data_source('hdfs://user/hadoop/out')

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)
    def test_build_workflow_for_job_pig(self, job_binary):
        job, job_exec = _create_all_stack(edp.JOB_TYPE_PIG)
        job_binary.return_value = {"name": "script.pig"}

        input_data = _create_data_source('swift://ex/i')
        output_data = _create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        self.assertIn("""
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)
    def _build_workflow_with_conf_common(self, job_type):
        job, _ = _create_all_stack(job_type)

        input_data = _create_data_source('swift://ex/i')
        output_data = _create_data_source('swift://ex/o')

        job_exec = _create_job_exec(job.id, job_type,
                                    configs={"configs": {'c': 'f'}})

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        self.assertIn("""
        <property>
          <name>c</name>
          <value>f</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)
    def test_build_workflow_for_job_java(self):
        # If args include swift paths, user and password values
        # will have to be supplied via configs instead of being
        # lifted from input or output data sources.
        # These values must match the credentials asserted in the
        # generated XML below.
        configs = {sw.HADOOP_SWIFT_USERNAME: 'admin',
                   sw.HADOOP_SWIFT_PASSWORD: 'admin1'}

        configs = {
            'configs': configs,
            'args': ['swift://ex/i',
                     'output_path']
        }

        job, job_exec = _create_all_stack(edp.JOB_TYPE_JAVA, configs)

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>swift://ex.sahara/i</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)
    def _build_workflow_common(self, job_type, streaming=False):
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = _create_all_stack(job_type, configs)

        input_data = _create_data_source('swift://ex/i')
        output_data = _create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
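
# The tests above rely on module-level helpers (_create_all_stack,
# _create_data_source, _create_job_exec, _create_cluster) and on a
# mock.patch decorator that injects the ``job_binary`` argument; those are
# defined elsewhere in this test module.  The function below is only a
# hypothetical sketch of what a _create_data_source-style helper could look
# like, assuming ``mock`` is imported at module level and assuming the data
# source carries the swift credentials 'admin'/'admin1' that the XML
# assertions expect.  It is illustrative, not the project's actual helper.

def _sketch_create_data_source(url):
    """Hypothetical stand-in for the _create_data_source helper used above."""
    data_source = mock.Mock()
    data_source.url = url
    if url.startswith("swift"):
        # Swift sources expose credentials that workflow_factory copies
        # into fs.swift.service.sahara.* properties.
        data_source.type = "swift"
        data_source.credentials = {'user': 'admin',
                                   'password': 'admin1'}
    elif url.startswith("hdfs"):
        # HDFS sources carry no credentials.
        data_source.type = "hdfs"
    return data_source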
def _run_job(job_execution_id):
    ctx = context.ctx()

    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return

    job_execution = _update_job_execution_extra(job_execution, cluster)

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = _get_data_sources(job_execution, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    plugin = _get_plugin(cluster)
    hdfs_user = plugin.get_hdfs_user()
    oozie_server = plugin.get_oozie_server(cluster)

    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    wf_xml = workflow_factory.get_workflow_xml(
        job, cluster, job_execution, input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server, wf_dir,
                                            wf_xml, hdfs_user)

    client = _create_oozie_client(cluster)
    job_params = _get_oozie_job_params(cluster, hdfs_user, path_to_workflow)
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)

    job_execution = conductor.job_execution_update(
        ctx, job_execution, {'oozie_job_id': oozie_job_id,
                             'start_time': datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)
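
# A hypothetical public entry point wrapping _run_job, sketching how a
# caller might guard against failures by marking the job execution FAILED.
# The run_job_sketch name, the LOG object, and the 'info'/'status' update
# shape are assumptions for illustration, not this module's confirmed API.

def run_job_sketch(job_execution_id):
    try:
        _run_job(job_execution_id)
    except Exception as ex:
        # Record the failure and stamp start/end times so the execution
        # does not stay stuck in a pending state.
        LOG.exception("Can't run job execution '%s' (reason: %s)",
                      job_execution_id, ex)
        conductor.job_execution_update(
            context.ctx(), job_execution_id,
            {'info': {'status': 'FAILED'},
             'start_time': datetime.datetime.now(),
             'end_time': datetime.datetime.now()})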