Example #1
    def test_build_workflow_for_job_pig(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>
      </configuration>""", res)
Example #3
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('hdfs://user/hadoop/out')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        job, job_exec = u.create_job_exec(
            edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})
        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('hdfs://user/hadoop/out')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)
Example #4
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                                job_exec, input_data,
                                                output_data, 'hadoop')

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        self.assertEqual(xmlutils.get_text_from_node(hive, 'job-xml'),
                         '/user/hadoop/conf/hive-site.xml')

        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {
                'fs.swift.service.sahara.password': 'admin1',
                'fs.swift.service.sahara.username': 'admin'
            }, properties)

        self.assertEqual(xmlutils.get_text_from_node(hive, 'script'),
                         'script.q')

        params = xmlutils.get_param_dict(hive)
        self.assertEqual(
            {
                'INPUT': 'swift://ex.sahara/i',
                'OUTPUT': 'swift://ex.sahara/o'
            }, params)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

        res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                                job_exec, input_data,
                                                output_data, 'hadoop')

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {
                'fs.swift.service.sahara.domain.name':
                'sahara_proxy_domain',
                'fs.swift.service.sahara.trust.id':
                '0123456789abcdef0123456789abcdef',
                'fs.swift.service.sahara.password':
                '55555555-6666-7777-8888-999999999999',
                'fs.swift.service.sahara.username':
                'job_00000000-1111-2222-3333-4444444444444444'
            }, properties)
Example #5
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        self.assertEqual('/user/hadoop/conf/hive-site.xml',
                         xmlutils.get_text_from_node(hive, 'job-xml'))

        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual({'fs.swift.service.sahara.password': 'admin1',
                          'fs.swift.service.sahara.username': 'admin'},
                         properties)

        self.assertEqual('script.q',
                         xmlutils.get_text_from_node(hive, 'script'))

        params = xmlutils.get_param_dict(hive)
        self.assertEqual({'INPUT': 'swift://ex.sahara/i',
                          'OUTPUT': 'swift://ex.sahara/o'}, params)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName('hive')[0]
        configuration = hive.getElementsByTagName('configuration')
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual({
            'fs.swift.service.sahara.domain.name':
            'sahara_proxy_domain',
            'fs.swift.service.sahara.trust.id':
            '0123456789abcdef0123456789abcdef',
            'fs.swift.service.sahara.password':
            '55555555-6666-7777-8888-999999999999',
            'fs.swift.service.sahara.username':
            'job_00000000-1111-2222-3333-4444444444444444'}, properties)
Example #6
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, configs={})
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source("swift://ex/i")
        output_data = u.create_data_source("swift://ex/o")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName("hive")[0]
        self.assertEqual("/user/hadoop/conf/hive-site.xml", xmlutils.get_text_from_node(hive, "job-xml"))

        configuration = hive.getElementsByTagName("configuration")
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {"fs.swift.service.sahara.password": "admin1", "fs.swift.service.sahara.username": "admin"}, properties
        )

        self.assertEqual("script.q", xmlutils.get_text_from_node(hive, "script"))

        params = xmlutils.get_param_dict(hive)
        self.assertEqual({"INPUT": "swift://ex.sahara/i", "OUTPUT": "swift://ex.sahara/o"}, params)

        # testing workflow creation with a proxy domain
        self.override_config("use_domain_for_proxy_users", True)
        self.override_config("proxy_user_domain_name", "sahara_proxy_domain")

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE, proxy=True)

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        doc = xml.parseString(res)
        hive = doc.getElementsByTagName("hive")[0]
        configuration = hive.getElementsByTagName("configuration")
        properties = xmlutils.get_property_dict(configuration[0])
        self.assertEqual(
            {
                "fs.swift.service.sahara.domain.name": "sahara_proxy_domain",
                "fs.swift.service.sahara.trust.id": "0123456789abcdef0123456789abcdef",
                "fs.swift.service.sahara.password": "55555555-6666-7777-8888-999999999999",
                "fs.swift.service.sahara.username": "job_00000000-1111-2222-3333-4444444444444444",
            },
            properties,
        )
Example #7
    def test_build_workflow_for_job_java(self):
        # If args include swift paths, user and password values
        # will have to be supplied via configs instead of being
        # lifted from input or output data sources
        configs = {sw.HADOOP_SWIFT_USERNAME: 'admin',
                   sw.HADOOP_SWIFT_PASSWORD: 'admin1'}

        configs = {
            'configs': configs,
            'args': ['swift://ex/i',
                     'output_path']
        }

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>swift://ex.sahara/i</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)
Example #8
    def test_build_workflow_for_job_pig(self, job_binary):

        job, job_exec = _create_all_stack(edp.JOB_TYPE_PIG)
        job_binary.return_value = {"name": "script.pig"}

        input_data = _create_data_source('swift://ex/i')
        output_data = _create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, _create_cluster(), job_exec, input_data, output_data)

        self.assertIn("""
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)
Example #9
    def _build_workflow_with_conf_common(self, job_type):

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        job, job_exec = u.create_job_exec(job_type,
                                          configs={"configs": {'c': 'f'}})

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
        <property>
          <name>c</name>
          <value>f</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)
Example #10
    def test_build_workflow_for_job_hive(self, job_binary):

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_HIVE)
        job_binary.return_value = {"name": "script.q"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        self.assertIn("""
      <job-xml>/user/hadoop/conf/hive-site.xml</job-xml>
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <script>script.q</script>
      <param>INPUT=swift://ex.sahara/i</param>
      <param>OUTPUT=swift://ex.sahara/o</param>""", res)
Example #11
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs))

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(job, self.cluster,
                                                   updated_job_configs,
                                                   input_source, output_source,
                                                   hdfs_user)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #12
    def test_build_workflow_for_job_java_with_adapter(self, edp_conf_mock):
        edp_conf_mock.return_value = True

        configs = {"configs": {"edp.java.main_class": "some_main"}}
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
        res = workflow_factory.get_workflow_xml(job, u.create_cluster(), job_exec.job_configs)

        self.assertIn("<main-class>org.openstack.sahara.edp.MainWrapper</main-class>", res)
        self.assertNotIn("some_main", res)
Example #13
    def test_build_workflow_for_job_java_with_adapter(self, edp_conf_mock):
        edp_conf_mock.return_value = True

        configs = {"configs": {"edp.java.main_class": "some_main"}}
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs)

        self.assertIn(
            "<main-class>org.openstack.sahara.edp.MainWrapper</main-class>",
            res)
        self.assertNotIn("some_main", res)
Example #14
    def test_build_workflow_for_job_shell(self):
        configs = {"configs": {"k1": "v1"}, "params": {"p1": "v1"}, "args": ["a1", "a2"]}
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_SHELL, configs)
        res = workflow_factory.get_workflow_xml(job, u.create_cluster(), job_exec.job_configs)

        self.assertIn("<name>k1</name>", res)
        self.assertIn("<value>v1</value>", res)

        self.assertIn("<env-var>p1=v1</env-var>", res)

        self.assertIn("<argument>a1</argument>", res)
        self.assertIn("<argument>a2</argument>", res)
Example #15
    def test_build_workflow_for_job_shell(self):
        configs = {"configs": {"k1": "v1"},
                   "params": {"p1": "v1"},
                   "args": ["a1", "a2"]}
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_SHELL, configs)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs)

        self.assertIn("<name>k1</name>", res)
        self.assertIn("<value>v1</value>", res)

        self.assertIn("<env-var>p1=v1</env-var>", res)

        self.assertIn("<argument>a1</argument>", res)
        self.assertIn("<argument>a2</argument>", res)
Example #16
    def _build_workflow_common(self, job_type, streaming=False):
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
Example #17
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution,
                                                                 job)
        proxy_configs = job_execution.job_configs.get('proxy_configs')

        for data_source in [input_source, output_source]:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, job_execution, input_source, output_source,
            hdfs_user)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #18
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution,
                                                                 job)

        for data_source in [input_source, output_source]:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.plugin.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the plugin api now.
        # However, other engines may need it.
        oozie_server = self.plugin.get_oozie_server(self.cluster)

        wf_dir = job_utils.create_hdfs_workflow_dir(oozie_server,
                                                    job, hdfs_user)
        job_utils.upload_job_files_to_hdfs(oozie_server, wf_dir,
                                           job, hdfs_user)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, job_execution, input_source, output_source)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #19
    def test_build_workflow_for_job_java(self):
        # If args include swift paths, user and password values
        # will have to be supplied via configs instead of being
        # lifted from input or output data sources
        configs = {sw.HADOOP_SWIFT_USERNAME: 'admin',
                   sw.HADOOP_SWIFT_PASSWORD: 'admin1'}

        configs = {
            'configs': configs,
            'args': ['swift://ex/i',
                     'output_path']
        }

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>swift://ex.sahara/i</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
        configs = {
            'configs': {},
            'args': ['swift://ex/i',
                     'output_path']
        }

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs,
                                          proxy=True)
        res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                                job_exec)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>swift://ex.sahara/i</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)
Example #20
    def _build_workflow_common(self, job_type, streaming=False, proxy=False):
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec, input_data, output_data,
            'hadoop')

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

        if not proxy:
            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
        else:
            # testing workflow creation with a proxy domain
            self.override_config('use_domain_for_proxy_users', True)
            self.override_config("proxy_user_domain_name",
                                 'sahara_proxy_domain')
            job, job_exec = u.create_job_exec(job_type, proxy=True)

            res = workflow_factory.get_workflow_xml(
                job, u.create_cluster(), job_exec, input_data, output_data,
                'hadoop')

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""", res)
Example #21
    def _build_workflow_common(self, job_type, streaming=False, proxy=False):
        if streaming:
            configs = {"edp.streaming.mapper": "/usr/bin/cat", "edp.streaming.reducer": "/usr/bin/wc"}
            configs = {"configs": configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source("swift://ex/i")
        output_data = u.create_data_source("swift://ex/o")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        if streaming:
            self.assertIn(
                """
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""",
                res,
            )

        self.assertIn(
            """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""",
            res,
        )

        if not proxy:
            self.assertIn(
                """
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""",
                res,
            )

            self.assertIn(
                """
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""",
                res,
            )
        else:
            # testing workflow creation with a proxy domain
            self.override_config("use_domain_for_proxy_users", True)
            self.override_config("proxy_user_domain_name", "sahara_proxy_domain")
            job, job_exec = u.create_job_exec(job_type, proxy=True)

            res = workflow_factory.get_workflow_xml(
                job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
            )

            self.assertIn(
                """
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""",
                res,
            )
Example #22
    def _prepare_run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        prepared_job_params = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        prepared_job_params['context'] = ctx
        prepared_job_params['hdfs_user'] = hdfs_user
        prepared_job_params['path_to_workflow'] = path_to_workflow
        prepared_job_params['use_hbase_lib'] = use_hbase_lib
        prepared_job_params['job_execution'] = job_execution
        prepared_job_params['oozie_params'] = oozie_params
        prepared_job_params['wf_dir'] = wf_dir
        prepared_job_params['oozie_server'] = oozie_server

        return prepared_job_params
Example #24
    def run_job(self, job_execution):
        ctx = context.ctx()

        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution, job, data_source_urls)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = job_utils.resolve_data_source_references(
            job_execution.job_configs, job_execution.id, data_source_urls
        )

        job_execution = conductor.job_execution_update(ctx, job_execution, {"data_source_urls": data_source_urls})

        proxy_configs = updated_job_configs.get("proxy_configs")
        configs = updated_job_configs.get("configs", {})
        use_hbase_lib = configs.get("edp.hbase_common_lib", {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith("oozie."):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == "hdfs":
                h.configure_cluster_for_hdfs(self.cluster, data_source_urls[data_source.id])
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs, proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs, input_source, output_source, hdfs_user, data_source_urls
        )

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir, wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow, oozie_params, use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params), job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info["status"] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution, oozie_job_id)["status"]
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #25
    def run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #26
    def run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_info(job_execution, oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #27
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source("swift://ex/i")
        output_data = u.create_data_source("hdfs://user/hadoop/out")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""",
            res,
        )

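        # HDFS input with a Swift output: the same credentials should now be
        # picked up from the output data source.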
        input_data = u.create_data_source("hdfs://user/hadoop/in")
        output_data = u.create_data_source("swift://ex/o")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""",
            res,
        )

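        # With no Swift data sources, the user-supplied 'dummy' config should
        # appear in the generated configuration block.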
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={"configs": {"dummy": "value"}})
        input_data = u.create_data_source("hdfs://user/hadoop/in")
        output_data = u.create_data_source("hdfs://user/hadoop/out")
        data_source_urls = {input_data.id: input_data.url, output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs, input_data, output_data, "hadoop", data_source_urls
        )

        self.assertIn(
            """
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""",
            res,
        )
Example #28
0
    def test_build_workflow_for_job_java(self):
        # If args include swift paths, user and password values
        # will have to be supplied via configs instead of being
        # lifted from input or output data sources
        configs = {sw.HADOOP_SWIFT_USERNAME: 'admin',
                   sw.HADOOP_SWIFT_PASSWORD: 'admin1'}

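        # Wrap the credentials under 'configs' and add positional args,
        # matching the structure expected in job_exec.job_configs.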
        configs = {
            'configs': configs,
            'args': ['swift://ex/i',
                     'output_path']
        }

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs)
        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>swift://ex.sahara/i</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)

        # testing workflow creation with a proxy domain
        self.override_config('use_domain_for_proxy_users', True)
        self.override_config("proxy_user_domain_name", 'sahara_proxy_domain')
        configs = {
            'configs': {},
            'args': ['swift://ex/i',
                     'output_path']
        }

        job, job_exec = u.create_job_exec(edp.JOB_TYPE_JAVA, configs,
                                          proxy=True)
        res = workflow_factory.get_workflow_xml(job, u.create_cluster(),
                                                job_exec.job_configs)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>swift://ex.sahara/i</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)
Example #29
0
    def _build_workflow_common(self, job_type, streaming=False, proxy=False):
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = u.create_job_exec(job_type, configs)

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.sahara/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.sahara/i</value>
        </property>""", res)

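        # Without a proxy, the fixed data source credentials are expected in
        # the workflow; with a proxy, trust-scoped credentials are used instead.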
        if not proxy:
            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>""", res)

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>""", res)
        else:
            # testing workflow creation with a proxy domain
            self.override_config('use_domain_for_proxy_users', True)
            self.override_config("proxy_user_domain_name",
                                 'sahara_proxy_domain')
            job, job_exec = u.create_job_exec(job_type, proxy=True)

            res = workflow_factory.get_workflow_xml(
                job, u.create_cluster(), job_exec.job_configs,
                input_data, output_data, 'hadoop', data_source_urls)

            self.assertIn("""
        <property>
          <name>fs.swift.service.sahara.domain.name</name>
          <value>sahara_proxy_domain</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>55555555-6666-7777-8888-999999999999</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.trust.id</name>
          <value>0123456789abcdef0123456789abcdef</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>job_00000000-1111-2222-3333-4444444444444444</value>
        </property>""", res)
Example #30
0
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = u.create_job_exec(edp.JOB_TYPE_PIG, configs={})
        job_binary.return_value = {"name": "script.pig"}

        input_data = u.create_data_source('swift://ex/i')
        output_data = u.create_data_source('hdfs://user/hadoop/out')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('swift://ex/o')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.sahara.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.sahara.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        job, job_exec = u.create_job_exec(
            edp.JOB_TYPE_PIG, configs={'configs': {'dummy': 'value'}})
        input_data = u.create_data_source('hdfs://user/hadoop/in')
        output_data = u.create_data_source('hdfs://user/hadoop/out')
        data_source_urls = {input_data.id: input_data.url,
                            output_data.id: output_data.url}

        res = workflow_factory.get_workflow_xml(
            job, u.create_cluster(), job_exec.job_configs,
            input_data, output_data, 'hadoop', data_source_urls)

        self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)