Example #1
    def _build_workflow_with_conf_common(self, job_type):
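        # Shared helper: builds a workflow for the given job type with a
        # user-supplied config ({'c': 'f'}) and checks that the config and
        # the mapred input/output dirs appear in the generated workflow XML.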

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        job, job_exec = u.create_job_exec(job_type,
                                          configs={"configs": {'c': 'f'}})

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
        <property>
          <name>c</name>
          <value>f</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)
Example #2
    def test_build_workflow_for_job_hive(self, job_binary):
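        # The generated Hive action should reference hive-site.xml, carry the
        # Swift credentials as configuration properties, name the script, and
        # expose the data sources as INPUT/OUTPUT params; with a proxy user it
        # should carry the proxy domain, trust id and proxy credentials.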

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                                job_exec, input_data,
                                                output_data, 'hadoop')

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        self.assertEqual(xmlutils.get_text_from_node(hive, 'job-xml'),
                         '/user/hadoop/conf/hive-site.xml')

        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {
                'fs.swift.service.sahara.password': 'admin1',
                'fs.swift.service.sahara.username': 'admin'
            }, properties)

        self.assertEqual(xmlutils.get_text_from_node(hive, 'script'),
                         'script.q')

        params = xmlutils.get_param_dict(hive)
        self.assertEqual(
            {
                'INPUT': 'swift://ex.sahara/i',
                'OUTPUT': 'swift://ex.sahara/o'
            }, params)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

        res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                                job_exec, input_data,
                                                output_data, 'hadoop')

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {
                'fs.swift.service.sahara.domain.name':
                'sahara_proxy_domain',
                'fs.swift.service.sahara.trust.id':
                '0123456789abcdef0123456789abcdef',
                'fs.swift.service.sahara.password':
                '55555555-6666-7777-8888-999999999999',
                'fs.swift.service.sahara.username':
                'job_00000000-1111-2222-3333-4444444444444444'
            }, properties)
Example #3
    def test_build_workflow_for_job_pig(self, job_binary):
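        # The generated Pig action should carry the INPUT/OUTPUT params, the
        # Swift credentials, and the script name; with a proxy user the
        # configuration should use the proxy domain, trust id and generated
        # proxy credentials instead of the plain username/password.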

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>
      </configuration>""", res)
Example #4
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE)
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <job-xml>/user/hadoop/conf/hive-site.xml</job-xml>
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <script>script.q</script>
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)
Example #5
    def test_build_workflow_for_job_pig(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>
      </configuration>""", res)
Example #6
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        self.assertEqual('/user/hadoop/conf/hive-site.xml',
                         xmlutils.get_text_from_node(hive, 'job-xml'))

        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual({'fs.swift.service.sahara.password': 'admin1',
                          'fs.swift.service.sahara.username': 'admin'},
                         properties)

        self.assertEqual('script.q',
                         xmlutils.get_text_from_node(hive, 'script'))

        params = xmlutils.get_param_dict(hive)
        self.assertEqual({'INPUT': 'swift://ex.sahara/i',
                          'OUTPUT': 'swift://ex.sahara/o'}, params)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual({
            'fs.swift.service.sahara.domain.name':
            'sahara_proxy_domain',
            'fs.swift.service.sahara.trust.id':
            '0123456789abcdef0123456789abcdef',
            'fs.swift.service.sahara.password':
            '55555555-6666-7777-8888-999999999999',
            'fs.swift.service.sahara.username':
            'job_00000000-1111-2222-3333-4444444444444444'}, properties)
Example #7
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source("swift://ex/i")
        output_data = u.create_data_source("swift://ex/o")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName("hive")[0]
        self.assertEqual("/user/hadoop/conf/hive-site.xml", xmlutils.get_text_from_node(hive, "job-xml"))

        configuration = hive.getElementsByTagName("configuration")
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {"fs.swift.service.sahara.password": "******", "fs.swift.service.sahara.username": "******"}, properties
        )

        self.assertEqual("script.q", xmlutils.get_text_from_node(hive, "script"))

        params = xmlutils.get_param_dict(hive)
        self.assertEqual({"INPUT": "swift://ex.sahara/i", "OUTPUT": "swift://ex.sahara/o"}, params)

        # testing workflow creation with a proxy domain
        self.override_config("use_domain_for_proxy_users", True)
        self.override_config("proxy_user_domain_name", "sahara_proxy_domain")

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName("hive")[0]
        configuration = hive.getElementsByTagName("configuration")
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {
                "fs.swift.service.sahara.domain.name": "sahara_proxy_domain",
                "fs.swift.service.sahara.trust.id": "0123456789abcdef0123456789abcdef",
                "fs.swift.service.sahara.password": "******",
                "fs.swift.service.sahara.username": "******",
            },
            properties,
        )
Example #8
    def _build_workflow_common(self, job_type, streaming=False):
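        # Shared helper: optionally adds streaming mapper/reducer configs,
        # then checks that the generated workflow XML contains the mapred
        # input/output dirs and the Swift credentials.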
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
Example #9
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('hdfs://user/hadoop/out')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        job, job_exec = u.create_job_exec(
            edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})
        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('hdfs://user/hadoop/out')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)
Example #10
    def test_resolve_data_source_refs(self, data_source_get_all, ctx):
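        # resolve_data_source_references() should replace name references
        # (DATA_SOURCE_PREFIX + name) and raw data source ids in 'args' with
        # the corresponding URLs, honouring the SUBST_NAME/SUBST_UUID flags,
        # and fill in blank Swift configs from the data source credentials.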

        ctx.return_value = 'dummy'

        name_ref = job_utils.DATA_SOURCE_PREFIX + 'input'

        input = u.create_data_source("swift://container/input",
                                     name="input",
                                     id=six.text_type(uuid.uuid4()))

        output = u.create_data_source("swift://container/output",
                                      name="output",
                                      id=six.text_type(uuid.uuid4()))

        by_name = {'input': input,
                   'output': output}

        by_id = {input.id: input,
                 output.id: output}

        # Pretend to be the database
        def _get_all(ctx, **kwargs):
            name = kwargs.get('name')
            if name in by_name:
                name_list = [by_name[name]]
            else:
                name_list = []

            id = kwargs.get('id')
            if id in by_id:
                id_list = [by_id[id]]
            else:
                id_list = []
            return list(set(name_list + id_list))

        data_source_get_all.side_effect = _get_all

        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True},
            'args': [name_ref, output.id, input.id]}

        ds, nc = job_utils.resolve_data_source_references(job_configs)
        self.assertEqual(2, len(ds))
        self.assertEqual([input.url, output.url, input.url], nc['args'])
        # Swift configs should be filled in since they were blank
        self.assertEqual(input.credentials['user'],
                         nc['configs']['fs.swift.service.sahara.username'])
        self.assertEqual(input.credentials['password'],
                         nc['configs']['fs.swift.service.sahara.password'])

        job_configs['configs'] = {'fs.swift.service.sahara.username': 'admin',
                                  'fs.swift.service.sahara.password': 'admin1',
                                  job_utils.DATA_SOURCE_SUBST_NAME: False,
                                  job_utils.DATA_SOURCE_SUBST_UUID: True}
        ds, nc = job_utils.resolve_data_source_references(job_configs)
        self.assertEqual(2, len(ds))
        self.assertEqual([name_ref, output.url, input.url], nc['args'])
        # Swift configs should not be overwritten
        self.assertEqual(job_configs['configs'], nc['configs'])

        job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: True,
                                  job_utils.DATA_SOURCE_SUBST_UUID: False}
        job_configs['proxy_configs'] = {'proxy_username': 'admin',
                                        'proxy_password': 'admin1',
                                        'proxy_trust_id': 'trustme'}
        ds, nc = job_utils.resolve_data_source_references(job_configs)
        self.assertEqual(1, len(ds))
        self.assertEqual([input.url, output.id, input.id], nc['args'])

        # Swift configs should be empty and proxy configs should be preserved
        self.assertEqual(job_configs['configs'], nc['configs'])
        self.assertEqual(job_configs['proxy_configs'], nc['proxy_configs'])

        # Substitution not enabled
        job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: False,
                                  job_utils.DATA_SOURCE_SUBST_UUID: False}
        ds, nc = job_utils.resolve_data_source_references(job_configs)
        self.assertEqual(0, len(ds))
        self.assertEqual(job_configs['args'], nc['args'])
        self.assertEqual(job_configs['configs'], nc['configs'])

        # Substitution enabled but no values to modify
        job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: True,
                                  job_utils.DATA_SOURCE_SUBST_UUID: True}
        job_configs['args'] = ['val1', 'val2', 'val3']
        ds, nc = job_utils.resolve_data_source_references(job_configs)
        self.assertEqual(0, len(ds))
        self.assertEqual(job_configs['args'], nc['args'])
        self.assertEqual(job_configs['configs'], nc['configs'])
Example #11
    def test_resolve_data_source_refs(self, data_source_get_all, ctx):
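        # Here resolve_data_source_references() takes the job execution id and
        # a urls mapping; the %JOB_EXEC_ID% placeholder in the output data
        # source URL should be replaced with the actual id in the resolved args.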

        ctx.return_value = 'dummy'

        name_ref = job_utils.DATA_SOURCE_PREFIX + 'input'
        job_exec_id = uuidutils.generate_uuid()

        input_url = "swift://container/input"
        input = u.create_data_source(input_url,
                                     name="input",
                                     id=uuidutils.generate_uuid())

        output = u.create_data_source("swift://container/output.%JOB_EXEC_ID%",
                                      name="output",
                                      id=uuidutils.generate_uuid())
        output_url = "swift://container/output." + job_exec_id

        by_name = {'input': input, 'output': output}

        by_id = {input.id: input, output.id: output}

        # Pretend to be the database
        def _get_all(ctx, **kwargs):
            name = kwargs.get('name')
            if name in by_name:
                name_list = [by_name[name]]
            else:
                name_list = []

            id = kwargs.get('id')
            if id in by_id:
                id_list = [by_id[id]]
            else:
                id_list = []
            return list(set(name_list + id_list))

        data_source_get_all.side_effect = _get_all

        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True
            },
            'args': [name_ref, output.id, input.id]
        }
        urls = {}
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, urls)
        self.assertEqual(2, len(ds))
        self.assertEqual([input.url, output_url, input.url], nc['args'])

        # Substitution not enabled
        job_configs['configs'] = {
            job_utils.DATA_SOURCE_SUBST_NAME: False,
            job_utils.DATA_SOURCE_SUBST_UUID: False
        }
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, {})
        self.assertEqual(0, len(ds))
        self.assertEqual(job_configs['args'], nc['args'])
        self.assertEqual(job_configs['configs'], nc['configs'])

        # Substitution enabled but no values to modify
        job_configs['configs'] = {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
        job_configs['args'] = ['val1', 'val2', 'val3']
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, {})
        self.assertEqual(0, len(ds))
        self.assertEqual(nc['args'], job_configs['args'])
        self.assertEqual(nc['configs'], job_configs['configs'])
Example #12
    def test_prepare_cluster(self, ctx):
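        # prepare_cluster() should fill in missing Swift credentials from the
        # data source, leave existing Swift configs and proxy configs alone,
        # and do nothing when configs are absent or the dict is frozen.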

        ctx.return_value = 'dummy'

        ds_url = "swift://container/input"
        ds = u.create_data_source(ds_url,
                                  name="data_source",
                                  id=uuidutils.generate_uuid())

        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True
                }
            }

        old_configs = copy.deepcopy(job_configs)

        self.s_type.prepare_cluster(ds, u.create_cluster(),
                                    job_configs=job_configs)
        # Swift configs should be filled in since they were blank
        self.assertEqual(ds.credentials['user'],
                         job_configs['configs']
                         ['fs.swift.service.sahara.username'])
        self.assertEqual(ds.credentials['password'],
                         job_configs['configs']
                         ['fs.swift.service.sahara.password'])

        self.assertNotEqual(old_configs, job_configs)

        job_configs['configs'] = {'fs.swift.service.sahara.username': 'admin',
                                  'fs.swift.service.sahara.password': 'admin1',
                                  job_utils.DATA_SOURCE_SUBST_NAME: False,
                                  job_utils.DATA_SOURCE_SUBST_UUID: True}
        old_configs = copy.deepcopy(job_configs)

        self.s_type.prepare_cluster(ds, u.create_cluster(),
                                    job_configs=job_configs)
        # Swift configs should not be overwritten
        self.assertEqual(old_configs['configs'], job_configs['configs'])

        job_configs['configs'] = {job_utils.DATA_SOURCE_SUBST_NAME: True,
                                  job_utils.DATA_SOURCE_SUBST_UUID: False}
        job_configs['proxy_configs'] = {'proxy_username': 'admin',
                                        'proxy_password': 'admin1',
                                        'proxy_trust_id': 'trustme'}
        old_configs = copy.deepcopy(job_configs)
        self.s_type.prepare_cluster(ds, u.create_cluster(),
                                    job_configs=job_configs)

        # Swift configs should be empty and proxy configs should be preserved
        self.assertEqual(old_configs['configs'], job_configs['configs'])
        self.assertEqual(old_configs['proxy_configs'],
                         job_configs['proxy_configs'])

        # If there's no configs do nothing
        job_configs['configs'] = None
        old_configs = copy.deepcopy(job_configs)

        self.s_type.prepare_cluster(ds, u.create_cluster(),
                                    job_configs=job_configs)
        self.assertEqual(old_configs, job_configs)

        # If it's a FrozenDict do nothing
        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True
                }
            }

        old_configs = copy.deepcopy(job_configs)
        job_configs = FrozenDict(job_configs)

        self.s_type.prepare_cluster(ds, u.create_cluster(),
                                    job_configs=job_configs)
        self.assertEqual(old_configs, job_configs)
Example #13
    def _build_workflow_common(self, job_type, streaming=False, proxy=False):
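        # Builds the workflow and, when proxy=True, rebuilds the job execution
        # with proxy credentials and expects the proxy domain, trust id and
        # generated proxy user in the XML instead of the plain Swift
        # username/password.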
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        if not proxy:
            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
        else:
            # testing workflow creation with a proxy domain
            self.override_config('use_domain_for_proxy_users', True)
            self.override_config("proxy_user_domain_name",
                                 'sahara_proxy_domain')
            job, job_exec = u.create_job_exec(job_type, proxy=True)

            res = workflow_factory.get_workflow_xml(
                job, u.create_cluster(), job_exec.job_configs,
                input_data, output_data, 'hadoop', data_source_urls)

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""", res)
Example #14
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('hdfs://user/hadoop/out')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        job, job_exec = u.create_job_exec(
            edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})
        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('hdfs://user/hadoop/out')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)
Example #15
    def test_resolve_data_source_refs(self, data_source_get_all, ctx):
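        # Fuller variant: besides the %JOB_EXEC_ID% substitution, this checks
        # that blank Swift configs are filled in from the data source
        # credentials, existing configs are preserved, and the urls mapping
        # records the resolved URL for each referenced data source.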

        ctx.return_value = 'dummy'

        name_ref = job_utils.DATA_SOURCE_PREFIX + 'input'
        job_exec_id = six.text_type(uuid.uuid4())

        input_url = "swift://container/input"
        input = u.create_data_source(input_url,
                                     name="input",
                                     id=six.text_type(uuid.uuid4()))

        output = u.create_data_source("swift://container/output.%JOB_EXEC_ID%",
                                      name="output",
                                      id=six.text_type(uuid.uuid4()))
        output_url = "swift://container/output." + job_exec_id

        by_name = {'input': input, 'output': output}

        by_id = {input.id: input, output.id: output}

        # Pretend to be the database
        def _get_all(ctx, **kwargs):
            name = kwargs.get('name')
            if name in by_name:
                name_list = [by_name[name]]
            else:
                name_list = []

            id = kwargs.get('id')
            if id in by_id:
                id_list = [by_id[id]]
            else:
                id_list = []
            return list(set(name_list + id_list))

        data_source_get_all.side_effect = _get_all

        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True
            },
            'args': [name_ref, output.id, input.id]
        }
        urls = {}
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, urls)
        self.assertEqual(2, len(ds))
        self.assertEqual([input.url, output_url, input.url], nc['args'])
        # Swift configs should be filled in since they were blank
        self.assertEqual(input.credentials['user'],
                         nc['configs']['fs.swift.service.sahara.username'])
        self.assertEqual(input.credentials['password'],
                         nc['configs']['fs.swift.service.sahara.password'])
        self.assertEqual(2, len(urls))
        self.assertItemsEqual(
            {
                input.id: (input_url, input_url),
                output.id: (output_url, output_url)
            }, urls)

        job_configs['configs'] = {
            'fs.swift.service.sahara.username': 'admin',
            'fs.swift.service.sahara.password': 'admin1',
            job_utils.DATA_SOURCE_SUBST_NAME: False,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, {})
        self.assertEqual(2, len(ds))
        self.assertEqual([name_ref, output_url, input.url], nc['args'])
        # Swift configs should not be overwritten
        self.assertEqual(job_configs['configs'], nc['configs'])

        job_configs['configs'] = {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: False
        }
        job_configs['proxy_configs'] = {
            'proxy_username': 'admin',
            'proxy_password': 'admin1',
            'proxy_trust_id': 'trustme'
        }
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, {})
        self.assertEqual(1, len(ds))
        self.assertEqual([input.url, output.id, input.id], nc['args'])

        # Swift configs should be empty and proxy configs should be preserved
        self.assertEqual(job_configs['configs'], nc['configs'])
        self.assertEqual(job_configs['proxy_configs'], nc['proxy_configs'])

        # Substitution not enabled
        job_configs['configs'] = {
            job_utils.DATA_SOURCE_SUBST_NAME: False,
            job_utils.DATA_SOURCE_SUBST_UUID: False
        }
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, {})
        self.assertEqual(0, len(ds))
        self.assertEqual(job_configs['args'], nc['args'])
        self.assertEqual(job_configs['configs'], nc['configs'])

        # Substitution enabled but no values to modify
        job_configs['configs'] = {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
        job_configs['args'] = ['val1', 'val2', 'val3']
        ds, nc = job_utils.resolve_data_source_references(
            job_configs, job_exec_id, {})
        self.assertEqual(0, len(ds))
        self.assertEqual(nc['args'], job_configs['args'])
        self.assertEqual(nc['configs'], job_configs['configs'])
Example #16
    def test_prepare_cluster(self, ctx):

        ctx.return_value = 'dummy'

        ds_url = "swift://container/input"
        ds = u.create_data_source(ds_url,
                                  name="data_source",
                                  id=uuidutils.generate_uuid())

        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True
            }
        }

        old_configs = copy.deepcopy(job_configs)

        self.s_type.prepare_cluster(ds,
                                    u.create_cluster(),
                                    job_configs=job_configs)
        # Swift configs should be filled in since they were blank
        self.assertEqual(
            ds.credentials['user'],
            job_configs['configs']['fs.swift.service.sahara.username'])
        self.assertEqual(
            ds.credentials['password'],
            job_configs['configs']['fs.swift.service.sahara.password'])

        self.assertNotEqual(old_configs, job_configs)

        job_configs['configs'] = {
            'fs.swift.service.sahara.username': 'admin',
            'fs.swift.service.sahara.password': 'admin1',
            job_utils.DATA_SOURCE_SUBST_NAME: False,
            job_utils.DATA_SOURCE_SUBST_UUID: True
        }
        old_configs = copy.deepcopy(job_configs)

        self.s_type.prepare_cluster(ds,
                                    u.create_cluster(),
                                    job_configs=job_configs)
        # Swift configs should not be overwritten
        self.assertEqual(old_configs['configs'], job_configs['configs'])

        job_configs['configs'] = {
            job_utils.DATA_SOURCE_SUBST_NAME: True,
            job_utils.DATA_SOURCE_SUBST_UUID: False
        }
        job_configs['proxy_configs'] = {
            'proxy_username': 'admin',
            'proxy_password': 'admin1',
            'proxy_trust_id': 'trustme'
        }
        old_configs = copy.deepcopy(job_configs)
        self.s_type.prepare_cluster(ds,
                                    u.create_cluster(),
                                    job_configs=job_configs)

        # Swift configs should be empty and proxy configs should be preserved
        self.assertEqual(old_configs['configs'], job_configs['configs'])
        self.assertEqual(old_configs['proxy_configs'],
                         job_configs['proxy_configs'])

        # If there's no configs do nothing
        job_configs['configs'] = None
        old_configs = copy.deepcopy(job_configs)

        self.s_type.prepare_cluster(ds,
                                    u.create_cluster(),
                                    job_configs=job_configs)
        self.assertEqual(old_configs, job_configs)

        # If it's a FrozenDict do nothing
        job_configs = {
            'configs': {
                job_utils.DATA_SOURCE_SUBST_NAME: True,
                job_utils.DATA_SOURCE_SUBST_UUID: True
            }
        }

        old_configs = copy.deepcopy(job_configs)
        job_configs = FrozenDict(job_configs)

        self.s_type.prepare_cluster(ds,
                                    u.create_cluster(),
                                    job_configs=job_configs)
        self.assertEqual(old_configs, job_configs)
Example #17
    def _build_workflow_common(self, job_type, streaming=False, proxy=False):
        if streaming:
            configs = {"edp.streaming.mapper": "/usr/bin/cat", "edp.streaming.reducer": "/usr/bin/wc"}
            configs = {"configs": configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source("swift://ex/i")
        output_data = u.create_data_source("swift://ex/o")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        if streaming:
            self.assertIn(
                """
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""",
                res,
            )

        self.assertIn(
            """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""",
            res,
        )

        if not proxy:
            self.assertIn(
                """
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""",
                res,
            )

            self.assertIn(
                """
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""",
                res,
            )
        else:
            # testing workflow creation with a proxy domain
            self.override_config("use_domain_for_proxy_users", True)
            self.override_config("proxy_user_domain_name", "sahara_proxy_domain")
            job, job_exec = u.create_job_exec(job_type, proxy=True)

            res = workflow_factory.get_workflow_xml(
                job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
            )

            self.assertIn(
                """
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""",
                res,
            )
Example #18
    def _build_workflow_common(self, job_type, streaming=False, proxy=False):
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        if not proxy:
            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
        else:
            # testing workflow creation with a proxy domain
            self.override_config('use_domain_for_proxy_users', True)
            self.override_config("proxy_user_domain_name",
                                 'sahara_proxy_domain')
            job, job_exec = u.create_job_exec(job_type, proxy=True)

            res = workflow_factory.get_workflow_xml(
                job, u.create_cluster(), job_exec, input_data, output_data,
                'hadoop')

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""", res)
Example #19
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source("swift://ex/i")
        output_data = u.create_data_source("hdfs://user/hadoop/out")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""",
            res,
        )

        input_data = u.create_data_source("hdfs://user/hadoop/in")
        output_data = u.create_data_source("swift://ex/o")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""",
            res,
        )

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={"configs": {"dummy": "value"}})
        input_data = u.create_data_source("hdfs://user/hadoop/in")
        output_data = u.create_data_source("hdfs://user/hadoop/out")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""",
            res,
        )