def _build_workflow_with_conf_common(self, job_type):
    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')

    job, job_exec = u.create_job_exec(job_type,
                                      configs={"configs": {'c': 'f'}})

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    self.assertIn("""
        <property>
          <name>c</name>
          <value>f</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

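# Illustrative sketch (not part of the original snippet): a concrete per-job-type
# test is assumed to drive the common helper above roughly like this; the choice
# of edp.JOB_TYPE_MAPREDUCE here is an assumption for illustration only.
def test_build_workflow_for_job_mapreduce_with_conf(self):
    self._build_workflow_with_conf_common(edp.JOB_TYPE_MAPREDUCE)
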
def test_build_workflow_for_job_hive(self, job_binary):
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
    job_binary.return_value = {"name": "script.q"}

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')

    res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                            job_exec, input_data,
                                            output_data, 'hadoop')

    doc = xml.parseString(res)
    hive = doc.getElementsByTagName('hive')[0]
    self.assertEqual(xmlutils.get_text_from_node(hive, 'job-xml'),
                     '/user/hadoop/conf/hive-site.xml')

    configuration = hive.getElementsByTagName('configuration')
    properties = xmlutils.get_property_dict(configuration[0])
    self.assertEqual(
        {
            'fs.swift.service.sahara.password': '******',
            'fs.swift.service.sahara.username': '******'
        }, properties)

    self.assertEqual(xmlutils.get_text_from_node(hive, 'script'),
                     'script.q')

    params = xmlutils.get_param_dict(hive)
    self.assertEqual(
        {
            'INPUT': 'swift://ex.sahara/i',
            'OUTPUT': 'swift://ex.sahara/o'
        }, params)

    # testing workflow creation with a proxy domain
    self.override_config('use_domain_for_proxy_users', True)
    self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

    res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                            job_exec, input_data,
                                            output_data, 'hadoop')

    doc = xml.parseString(res)
    hive = doc.getElementsByTagName('hive')[0]
    configuration = hive.getElementsByTagName('configuration')
    properties = xmlutils.get_property_dict(configuration[0])
    self.assertEqual(
        {
            'fs.swift.service.sahara.domain.name': 'sahara_proxy_domain',
            'fs.swift.service.sahara.trust.id':
                '0123456789abcdef0123456789abcdef',
            'fs.swift.service.sahara.password': '******',
            'fs.swift.service.sahara.username': '******'
        }, properties)

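# Note (assumption, not in the original snippet): the tests in this module that
# accept extra parameters such as ``job_binary``, ``data_source_get_all`` or
# ``ctx`` rely on ``mock.patch`` decorators that were omitted from the flattened
# listing. A minimal sketch of the assumed wiring, with a hypothetical patch
# target, would look like:
#
#     @mock.patch('sahara.conductor.API.job_binary_get')
#     def test_build_workflow_for_job_hive(self, job_binary):
#         job_binary.return_value = {"name": "script.q"}
#         ...
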
def test_build_workflow_for_job_pig(self, job_binary):
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
    job_binary.return_value = {"name": "script.pig"}

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')
    data_source_urls = {input_data.id: input_data.url,
                        output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    self.assertIn("""
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    self.assertIn("<script>script.pig</script>", res)

    # testing workflow creation with a proxy domain
    self.override_config('use_domain_for_proxy_users', True)
    self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, proxy=True)

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>
      </configuration>""", res)

def test_build_workflow_for_job_hive(self, job_binary):
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE)
    job_binary.return_value = {"name": "script.q"}

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    self.assertIn("""
      <job-xml>/user/hadoop/conf/hive-site.xml</job-xml>
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <script>script.q</script>
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

def test_build_workflow_for_job_hive(self, job_binary):
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
    job_binary.return_value = {"name": "script.q"}

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')
    data_source_urls = {input_data.id: input_data.url,
                        output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    doc = xml.parseString(res)
    hive = doc.getElementsByTagName('hive')[0]
    self.assertEqual('/user/hadoop/conf/hive-site.xml',
                     xmlutils.get_text_from_node(hive, 'job-xml'))

    configuration = hive.getElementsByTagName('configuration')
    properties = xmlutils.get_property_dict(configuration[0])
    self.assertEqual({'fs.swift.service.sahara.password': '******',
                      'fs.swift.service.sahara.username': '******'},
                     properties)

    self.assertEqual('script.q',
                     xmlutils.get_text_from_node(hive, 'script'))

    params = xmlutils.get_param_dict(hive)
    self.assertEqual({'INPUT': 'swift://ex.sahara/i',
                      'OUTPUT': 'swift://ex.sahara/o'}, params)

    # testing workflow creation with a proxy domain
    self.override_config('use_domain_for_proxy_users', True)
    self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    doc = xml.parseString(res)
    hive = doc.getElementsByTagName('hive')[0]
    configuration = hive.getElementsByTagName('configuration')
    properties = xmlutils.get_property_dict(configuration[0])
    self.assertEqual({
        'fs.swift.service.sahara.domain.name': 'sahara_proxy_domain',
        'fs.swift.service.sahara.trust.id':
            '0123456789abcdef0123456789abcdef',
        'fs.swift.service.sahara.password': '******',
        'fs.swift.service.sahara.username': '******'}, properties)

def test_build_workflow_for_job_hive(self, job_binary):
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
    job_binary.return_value = {"name": "script.q"}

    input_data = u.create_data_source("swift://ex/i")
    output_data = u.create_data_source("swift://ex/o")
    data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
    )

    doc = xml.parseString(res)
    hive = doc.getElementsByTagName("hive")[0]
    self.assertEqual("/user/hadoop/conf/hive-site.xml", xmlutils.get_text_from_node(hive, "job-xml"))

    configuration = hive.getElementsByTagName("configuration")
    properties = xmlutils.get_property_dict(configuration[0])
    self.assertEqual(
        {"fs.swift.service.sahara.password": "******", "fs.swift.service.sahara.username": "******"}, properties
    )

    self.assertEqual("script.q", xmlutils.get_text_from_node(hive, "script"))

    params = xmlutils.get_param_dict(hive)
    self.assertEqual({"INPUT": "swift://ex.sahara/i", "OUTPUT": "swift://ex.sahara/o"}, params)

    # testing workflow creation with a proxy domain
    self.override_config("use_domain_for_proxy_users", True)
    self.override_config("proxy_user_domain_name", "sahara_proxy_domain")
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
    )

    doc = xml.parseString(res)
    hive = doc.getElementsByTagName("hive")[0]
    configuration = hive.getElementsByTagName("configuration")
    properties = xmlutils.get_property_dict(configuration[0])
    self.assertEqual(
        {
            "fs.swift.service.sahara.domain.name": "sahara_proxy_domain",
            "fs.swift.service.sahara.trust.id": "0123456789abcdef0123456789abcdef",
            "fs.swift.service.sahara.password": "******",
            "fs.swift.service.sahara.username": "******",
        },
        properties,
    )

def _build_workflow_common(self, job_type, streaming=False):
    if streaming:
        configs = {'edp.streaming.mapper': '/usr/bin/cat',
                   'edp.streaming.reducer': '/usr/bin/wc'}
        configs = {'configs': configs}
    else:
        configs = {}

    job, job_exec = u.create_job_exec(job_type, configs)

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    if streaming:
        self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

    self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)

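# Illustrative sketch (not part of the original snippet): the helper above is
# assumed to be exercised by per-job-type tests for both the plain and the
# streaming MapReduce cases; the job-type constants used here are assumptions.
def test_build_workflow_for_job_mapreduce(self):
    self._build_workflow_common(edp.JOB_TYPE_MAPREDUCE)

def test_build_workflow_for_job_mapreduce_streaming(self):
    self._build_workflow_common(edp.JOB_TYPE_MAPREDUCE_STREAMING,
                                streaming=True)
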
def test_build_workflow_swift_configs(self, job_binary):
    # Test that swift configs come from either input or output data
    # sources
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
    job_binary.return_value = {"name": "script.pig"}

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('hdfs://user/hadoop/out')

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    input_data = u.create_data_source('hdfs://user/hadoop/in')
    output_data = u.create_data_source('swift://ex/o')

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    job, job_exec = u.create_job_exec(
        edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})
    input_data = u.create_data_source('hdfs://user/hadoop/in')
    output_data = u.create_data_source('hdfs://user/hadoop/out')

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)

def test_resolve_data_source_refs(self, data_source_get_all, ctx):
    ctx.return_value = 'dummy'

    name_ref = job_utils.DATA_SOURCE_PREFIX + 'input'

    input = u.create_data_source("swift://container/input",
                                 name="input",
                                 id=six.text_type(uuid.uuid4()))

    output = u.create_data_source("swift://container/output",
                                  name="output",
                                  id=six.text_type(uuid.uuid4()))

    by_name = {'input': input,
               'output': output}

    by_id = {input.id: input,
             output.id: output}

    # Pretend to be the database
    def _get_all(ctx, **kwargs):
        name = kwargs.get('name')
        if name in by_name:
            name_list = [by_name[name]]
        else:
            name_list = []

        id = kwargs.get('id')
        if id in by_id:
            id_list = [by_id[id]]
        else:
            id_list = []
        return list(set(name_list + id_list))

    data_source_get_all.side_effect = _get_all

    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True},
        'args': [name_ref, output.id, input.id]}
    ds, nc = job_utils.resolve_data_source_references(job_configs)
    self.assertEqual(2, len(ds))
    self.assertEqual([input.url, output.url, input.url], nc['args'])
    # Swift configs should be filled in since they were blank
    self.assertEqual(input.credentials['user'],
                     nc['configs']['fs.swift.service.sahara.username'])
    self.assertEqual(input.credentials['password'],
                     nc['configs']['fs.swift.service.sahara.password'])

    job_configs['configs'] = {'fs.swift.service.sahara.username': '******',
                              'fs.swift.service.sahara.password': '******',
                              job_utils.DATA_SOURCE_SUBST_NAME: False,
                              job_utils.DATA_SOURCE_SUBST_UUID: True}
    ds, nc = job_utils.resolve_data_source_references(job_configs)
    self.assertEqual(2, len(ds))
    self.assertEqual([name_ref, output.url, input.url], nc['args'])
    # Swift configs should not be overwritten
    self.assertEqual(job_configs['configs'], nc['configs'])

    job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: True,
                              job_utils.DATA_SOURCE_SUBST_UUID: False}
    job_configs['proxy_configs'] = {'proxy_username': '******',
                                    'proxy_password': '******',
                                    'proxy_trust_id': 'trustme'}
    ds, nc = job_utils.resolve_data_source_references(job_configs)
    self.assertEqual(1, len(ds))
    self.assertEqual([input.url, output.id, input.id], nc['args'])
    # Swift configs should be empty and proxy configs should be preserved
    self.assertEqual(job_configs['configs'], nc['configs'])
    self.assertEqual(job_configs['proxy_configs'], nc['proxy_configs'])

    # Substitution not enabled
    job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: False,
                              job_utils.DATA_SOURCE_SUBST_UUID: False}
    ds, nc = job_utils.resolve_data_source_references(job_configs)
    self.assertEqual(0, len(ds))
    self.assertEqual(job_configs['args'], nc['args'])
    self.assertEqual(job_configs['configs'], nc['configs'])

    # Substitution enabled but no values to modify
    job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: True,
                              job_utils.DATA_SOURCE_SUBST_UUID: True}
    job_configs['args'] = ['val1', 'val2', 'val3']
    ds, nc = job_utils.resolve_data_source_references(job_configs)
    self.assertEqual(0, len(ds))
    self.assertEqual(job_configs['args'], nc['args'])
    self.assertEqual(job_configs['configs'], nc['configs'])

def test_resolve_data_source_refs(self, data_source_get_all, ctx):
    ctx.return_value = 'dummy'

    name_ref = job_utils.DATA_SOURCE_PREFIX + 'input'
    job_exec_id = uuidutils.generate_uuid()

    input_url = "swift://container/input"
    input = u.create_data_source(input_url,
                                 name="input",
                                 id=uuidutils.generate_uuid())

    output = u.create_data_source("swift://container/output.%JOB_EXEC_ID%",
                                  name="output",
                                  id=uuidutils.generate_uuid())
    output_url = "swift://container/output." + job_exec_id

    by_name = {'input': input,
               'output': output}

    by_id = {input.id: input,
             output.id: output}

    # Pretend to be the database
    def _get_all(ctx, **kwargs):
        name = kwargs.get('name')
        if name in by_name:
            name_list = [by_name[name]]
        else:
            name_list = []

        id = kwargs.get('id')
        if id in by_id:
            id_list = [by_id[id]]
        else:
            id_list = []
        return list(set(name_list + id_list))

    data_source_get_all.side_effect = _get_all

    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        },
        'args': [name_ref, output.id, input.id]
    }
    urls = {}
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, urls)

    self.assertEqual(2, len(ds))
    self.assertEqual([input.url, output_url, input.url], nc['args'])

    # Substitution not enabled
    job_configs['configs'] = {
        job_utils.DATA_SOURCE_SUBST_NAME: False,
        job_utils.DATA_SOURCE_SUBST_UUID: False
    }
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, {})
    self.assertEqual(0, len(ds))
    self.assertEqual(job_configs['args'], nc['args'])
    self.assertEqual(job_configs['configs'], nc['configs'])

    # Substitution enabled but no values to modify
    job_configs['configs'] = {
        job_utils.DATA_SOURCE_SUBST_NAME: True,
        job_utils.DATA_SOURCE_SUBST_UUID: True
    }
    job_configs['args'] = ['val1', 'val2', 'val3']
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, {})
    self.assertEqual(0, len(ds))
    self.assertEqual(nc['args'], job_configs['args'])
    self.assertEqual(nc['configs'], job_configs['configs'])

def test_prepare_cluster(self, ctx):
    ctx.return_value = 'dummy'

    ds_url = "swift://container/input"
    ds = u.create_data_source(ds_url,
                              name="data_source",
                              id=uuidutils.generate_uuid())

    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
    }

    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    # Swift configs should be filled in since they were blank
    self.assertEqual(ds.credentials['user'],
                     job_configs['configs']
                     ['fs.swift.service.sahara.username'])
    self.assertEqual(ds.credentials['password'],
                     job_configs['configs']
                     ['fs.swift.service.sahara.password'])
    self.assertNotEqual(old_configs, job_configs)

    job_configs['configs'] = {'fs.swift.service.sahara.username': '******',
                              'fs.swift.service.sahara.password': '******',
                              job_utils.DATA_SOURCE_SUBST_NAME: False,
                              job_utils.DATA_SOURCE_SUBST_UUID: True}
    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    # Swift configs should not be overwritten
    self.assertEqual(old_configs['configs'], job_configs['configs'])

    job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: True,
                              job_utils.DATA_SOURCE_SUBST_UUID: False}
    job_configs['proxy_configs'] = {'proxy_username': '******',
                                    'proxy_password': '******',
                                    'proxy_trust_id': 'trustme'}
    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    # Swift configs should be empty and proxy configs should be preserved
    self.assertEqual(old_configs['configs'], job_configs['configs'])
    self.assertEqual(old_configs['proxy_configs'],
                     job_configs['proxy_configs'])

    # If there's no configs do nothing
    job_configs['configs'] = None
    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    self.assertEqual(old_configs, job_configs)

    # If it's a FrozenDict do nothing
    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
    }
    old_configs = copy.deepcopy(job_configs)
    job_configs = FrozenDict(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    self.assertEqual(old_configs, job_configs)

def _build_workflow_common(self, job_type, streaming=False, proxy=False):
    if streaming:
        configs = {'edp.streaming.mapper': '/usr/bin/cat',
                   'edp.streaming.reducer': '/usr/bin/wc'}
        configs = {'configs': configs}
    else:
        configs = {}

    job, job_exec = u.create_job_exec(job_type, configs)

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')
    data_source_urls = {input_data.id: input_data.url,
                        output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    if streaming:
        self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

    self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

    if not proxy:
        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
    else:
        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name",
                             'sahara_proxy_domain')

        job, job_exec = u.create_job_exec(job_type, proxy=True)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""", res)

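# Illustrative sketch (assumed, not from the original snippet): the proxy branch
# of the helper above would be driven by passing proxy=True, for example:
def test_build_workflow_for_job_mapreduce_proxy(self):
    self._build_workflow_common(edp.JOB_TYPE_MAPREDUCE, proxy=True)
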
def test_build_workflow_swift_configs(self, job_binary):
    # Test that swift configs come from either input or output data
    # sources
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
    job_binary.return_value = {"name": "script.pig"}

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('hdfs://user/hadoop/out')
    data_source_urls = {input_data.id: input_data.url,
                        output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    input_data = u.create_data_source('hdfs://user/hadoop/in')
    output_data = u.create_data_source('swift://ex/o')
    data_source_urls = {input_data.id: input_data.url,
                        output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

    job, job_exec = u.create_job_exec(
        edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})
    input_data = u.create_data_source('hdfs://user/hadoop/in')
    output_data = u.create_data_source('hdfs://user/hadoop/out')
    data_source_urls = {input_data.id: input_data.url,
                        output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs,
        input_data, output_data, 'hadoop', data_source_urls)

    self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)

def test_resolve_data_source_refs(self, data_source_get_all, ctx):
    ctx.return_value = 'dummy'

    name_ref = job_utils.DATA_SOURCE_PREFIX + 'input'
    job_exec_id = six.text_type(uuid.uuid4())

    input_url = "swift://container/input"
    input = u.create_data_source(input_url,
                                 name="input",
                                 id=six.text_type(uuid.uuid4()))

    output = u.create_data_source("swift://container/output.%JOB_EXEC_ID%",
                                  name="output",
                                  id=six.text_type(uuid.uuid4()))
    output_url = "swift://container/output." + job_exec_id

    by_name = {'input': input,
               'output': output}

    by_id = {input.id: input,
             output.id: output}

    # Pretend to be the database
    def _get_all(ctx, **kwargs):
        name = kwargs.get('name')
        if name in by_name:
            name_list = [by_name[name]]
        else:
            name_list = []

        id = kwargs.get('id')
        if id in by_id:
            id_list = [by_id[id]]
        else:
            id_list = []
        return list(set(name_list + id_list))

    data_source_get_all.side_effect = _get_all

    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        },
        'args': [name_ref, output.id, input.id]
    }
    urls = {}
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, urls)

    self.assertEqual(2, len(ds))
    self.assertEqual([input.url, output_url, input.url], nc['args'])
    # Swift configs should be filled in since they were blank
    self.assertEqual(input.credentials['user'],
                     nc['configs']['fs.swift.service.sahara.username'])
    self.assertEqual(input.credentials['password'],
                     nc['configs']['fs.swift.service.sahara.password'])
    self.assertEqual(2, len(urls))
    self.assertItemsEqual(
        {
            input.id: (input_url, input_url),
            output.id: (output_url, output_url)
        }, urls)

    job_configs['configs'] = {
        'fs.swift.service.sahara.username': '******',
        'fs.swift.service.sahara.password': '******',
        job_utils.DATA_SOURCE_SUBST_NAME: False,
        job_utils.DATA_SOURCE_SUBST_UUID: True
    }
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, {})
    self.assertEqual(2, len(ds))
    self.assertEqual([name_ref, output_url, input.url], nc['args'])
    # Swift configs should not be overwritten
    self.assertEqual(job_configs['configs'], nc['configs'])

    job_configs['configs'] = {
        job_utils.DATA_SOURCE_SUBST_NAME: True,
        job_utils.DATA_SOURCE_SUBST_UUID: False
    }
    job_configs['proxy_configs'] = {
        'proxy_username': '******',
        'proxy_password': '******',
        'proxy_trust_id': 'trustme'
    }
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, {})
    self.assertEqual(1, len(ds))
    self.assertEqual([input.url, output.id, input.id], nc['args'])
    # Swift configs should be empty and proxy configs should be preserved
    self.assertEqual(job_configs['configs'], nc['configs'])
    self.assertEqual(job_configs['proxy_configs'], nc['proxy_configs'])

    # Substitution not enabled
    job_configs['configs'] = {
        job_utils.DATA_SOURCE_SUBST_NAME: False,
        job_utils.DATA_SOURCE_SUBST_UUID: False
    }
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, {})
    self.assertEqual(0, len(ds))
    self.assertEqual(job_configs['args'], nc['args'])
    self.assertEqual(job_configs['configs'], nc['configs'])

    # Substitution enabled but no values to modify
    job_configs['configs'] = {
        job_utils.DATA_SOURCE_SUBST_NAME: True,
        job_utils.DATA_SOURCE_SUBST_UUID: True
    }
    job_configs['args'] = ['val1', 'val2', 'val3']
    ds, nc = job_utils.resolve_data_source_references(
        job_configs, job_exec_id, {})
    self.assertEqual(0, len(ds))
    self.assertEqual(nc['args'], job_configs['args'])
    self.assertEqual(nc['configs'], job_configs['configs'])

def test_prepare_cluster(self, ctx):
    ctx.return_value = 'dummy'

    ds_url = "swift://container/input"
    ds = u.create_data_source(ds_url,
                              name="data_source",
                              id=uuidutils.generate_uuid())

    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
    }

    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    # Swift configs should be filled in since they were blank
    self.assertEqual(
        ds.credentials['user'],
        job_configs['configs']['fs.swift.service.sahara.username'])
    self.assertEqual(
        ds.credentials['password'],
        job_configs['configs']['fs.swift.service.sahara.password'])
    self.assertNotEqual(old_configs, job_configs)

    job_configs['configs'] = {
        'fs.swift.service.sahara.username': '******',
        'fs.swift.service.sahara.password': '******',
        job_utils.DATA_SOURCE_SUBST_NAME: False,
        job_utils.DATA_SOURCE_SUBST_UUID: True
    }
    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    # Swift configs should not be overwritten
    self.assertEqual(old_configs['configs'], job_configs['configs'])

    job_configs['configs'] = {
        job_utils.DATA_SOURCE_SUBST_NAME: True,
        job_utils.DATA_SOURCE_SUBST_UUID: False
    }
    job_configs['proxy_configs'] = {
        'proxy_username': '******',
        'proxy_password': '******',
        'proxy_trust_id': 'trustme'
    }
    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    # Swift configs should be empty and proxy configs should be preserved
    self.assertEqual(old_configs['configs'], job_configs['configs'])
    self.assertEqual(old_configs['proxy_configs'],
                     job_configs['proxy_configs'])

    # If there's no configs do nothing
    job_configs['configs'] = None
    old_configs = copy.deepcopy(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    self.assertEqual(old_configs, job_configs)

    # If it's a FrozenDict do nothing
    job_configs = {
        'configs': {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
    }
    old_configs = copy.deepcopy(job_configs)
    job_configs = FrozenDict(job_configs)
    self.s_type.prepare_cluster(ds, u.create_cluster(),
                                job_configs=job_configs)
    self.assertEqual(old_configs, job_configs)

def _build_workflow_common(self, job_type, streaming=False, proxy=False):
    if streaming:
        configs = {"edp.streaming.mapper": "/usr/bin/cat", "edp.streaming.reducer": "/usr/bin/wc"}
        configs = {"configs": configs}
    else:
        configs = {}

    job, job_exec = u.create_job_exec(job_type, configs)

    input_data = u.create_data_source("swift://ex/i")
    output_data = u.create_data_source("swift://ex/o")
    data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
    )

    if streaming:
        self.assertIn(
            """
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""",
            res,
        )

    self.assertIn(
        """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""",
        res,
    )

    self.assertIn(
        """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""",
        res,
    )

    if not proxy:
        self.assertIn(
            """
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""",
            res,
        )
    else:
        # testing workflow creation with a proxy domain
        self.override_config("use_domain_for_proxy_users", True)
        self.override_config("proxy_user_domain_name", "sahara_proxy_domain")

        job, job_exec = u.create_job_exec(job_type, proxy=True)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""",
            res,
        )

def _build_workflow_common(self, job_type, streaming=False, proxy=False):
    if streaming:
        configs = {'edp.streaming.mapper': '/usr/bin/cat',
                   'edp.streaming.reducer': '/usr/bin/wc'}
        configs = {'configs': configs}
    else:
        configs = {}

    job, job_exec = u.create_job_exec(job_type, configs)

    input_data = u.create_data_source('swift://ex/i')
    output_data = u.create_data_source('swift://ex/o')

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec, input_data, output_data,
        'hadoop')

    if streaming:
        self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

    self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

    self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

    if not proxy:
        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
    else:
        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name",
                             'sahara_proxy_domain')

        job, job_exec = u.create_job_exec(job_type, proxy=True)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""", res)

def test_build_workflow_swift_configs(self, job_binary):
    # Test that swift configs come from either input or output data sources
    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
    job_binary.return_value = {"name": "script.pig"}

    input_data = u.create_data_source("swift://ex/i")
    output_data = u.create_data_source("hdfs://user/hadoop/out")
    data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
    )

    self.assertIn(
        """
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""",
        res,
    )

    input_data = u.create_data_source("hdfs://user/hadoop/in")
    output_data = u.create_data_source("swift://ex/o")
    data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
    )

    self.assertIn(
        """
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""",
        res,
    )

    job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={"configs": {"dummy": "value"}})
    input_data = u.create_data_source("hdfs://user/hadoop/in")
    output_data = u.create_data_source("hdfs://user/hadoop/out")
    data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

    res = workflow_factory.get_workflow_xml(
        job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
    )

    self.assertIn(
        """
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""",
        res,
    )