Example #1
    def test_build_workflow_for_job_java(self):
        # If args include swift paths, user and password values
        # will have to be supplied via configs instead of being
        # lifted from input or output data sources
        configs = {workflow_factory.swift_username: 'admin',
                   workflow_factory.swift_password: 'admin1'}

        configs = {
            'configs': configs,
            'args': ['input_path',
                     'output_path']
        }

        job, job_exec = _create_all_stack('Java', configs)
        creator = workflow_factory.get_creator(job)
        res = creator.get_workflow_xml(job_exec)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <main-class>%s</main-class>
      <java-opts>%s</java-opts>
      <arg>input_path</arg>
      <arg>output_path</arg>""" % (_java_main_class, _java_opts), res)
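
The `_create_all_stack`, `_create_data_source`, and `_create_job_exec` helpers used throughout these examples are module-level test fixtures that are not shown on this page. A minimal sketch of what they might look like, built on plain mock objects (the attribute names are inferred from the assertions above; the real fixtures may differ):

import mock


def _create_data_source(url):
    # Hedged sketch: a fake data source with the attributes the
    # workflow factory reads (url, type, swift credentials).
    data_source = mock.Mock()
    data_source.url = url
    data_source.type = url.split('://')[0]  # 'swift' or 'hdfs'
    if data_source.type == 'swift':
        # Credentials expected to surface as the
        # fs.swift.service.savanna.* properties asserted above
        data_source.credentials = {'user': 'admin', 'password': 'admin1'}
    return data_source


def _create_job_exec(job_id, job_type=None, configs=None):
    job_exec = mock.Mock()
    job_exec.job_id = job_id
    job_exec.job_configs = configs or {}
    return job_exec


def _create_all_stack(job_type, configs=None):
    job = mock.Mock()
    job.type = job_type
    job_exec = _create_job_exec(job.id, job_type, configs)
    return job, job_exec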
Example #2
    def _build_workflow_with_conf_common(self, job_type):
        job, _ = _create_all_stack(job_type)

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        job_exec = _create_job_exec(job.id,
                                    job_type, configs={"configs": {'c': 'f'}})

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        self.assertIn("""
        <property>
          <name>c</name>
          <value>f</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""", res)
Example #3
    def test_build_workflow_for_job_pig(self, job_binary):

        job, job_exec = _create_all_stack('Pig')
        job_binary.return_value = {"name": "script.pig"}

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        self.assertIn("""
      <param>INPUT=swift://ex.savanna/i</param>
      <param>OUTPUT=swift://ex.savanna/o</param>""", res)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)
Example #4
    def test_build_workflow_for_job_jar_with_conf(self):
        job, _ = _create_all_stack('Jar')

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        job_exec = _create_job_exec(job.id, configs={"configs": {'c': 'f'}})

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec.job_configs, input_data,
                                       output_data)

        self.assertIn(
            """
        <property>
          <name>c</name>
          <value>f</value>
        </property>""", res)

        self.assertIn(
            """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""", res)

        self.assertIn(
            """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""", res)
Example #5
    def test_build_workflow_for_job_hive(self, job_binary):

        job, origin = _create_all_stack("Hive")
        job_exec = _create_job_exec(job.id)
        job_binary.return_value = {"name": "script.q"}

        input_data = _create_data_source("swift://ex.savanna/i")
        output_data = _create_data_source("swift://ex.savanna/o")

        creator = workflow_factory.get_creator("Hive", origin)

        res = creator.get_workflow_xml(job_exec.job_configs, input_data, output_data)

        self.assertIn(
            """
      <job-xml>hive-site.xml</job-xml>
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>
      <script>script.q</script>
      <param>INPUT=swift://ex.savanna/i</param>
      <param>OUTPUT=swift://ex.savanna/o</param>""",
            res,
        )
Example #6
    def _build_workflow_common(self, job_type):
        job, job_exec = _create_all_stack(job_type)

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>""", res)
Example #7
    def test_build_workflow_for_job_pig(self, job_binary):

        job, job_exec = _create_all_stack('Pig')
        job_binary.return_value = {"name": "script.pig"}

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec.job_configs, input_data,
                                       output_data)

        self.assertIn(
            """
      <param>INPUT=swift://ex.savanna/i</param>
      <param>OUTPUT=swift://ex.savanna/o</param>""", res)

        self.assertIn(
            """
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        self.assertIn("<script>script.pig</script>", res)
Example #8
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)

    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    hdfs_user = plugin.get_hdfs_user()
    wf_dir = create_workflow_dir(u.get_jobtracker(cluster), job, hdfs_user)
    upload_job_files(u.get_jobtracker(cluster), wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution.job_configs,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(u.get_jobtracker(cluster),
                                            wf_dir, wf_xml, hdfs_user)

    jt_path = cluster['info']['MapReduce']['JobTracker']
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/")
    job_parameters = {"jobTracker": jt_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters))
    client.run_job(oozie_job_id)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})

    return job_execution
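
Here x.create_hadoop_xml(job_parameters) is expected to serialize the parameter dict into the Hadoop-style <configuration> document that the Oozie client submits. A minimal sketch of that serialization, assuming nothing beyond the <property>/<name>/<value> layout visible in the assertions above (an illustration, not Savanna's actual implementation):

from xml.dom import minidom


def create_hadoop_xml(configs):
    # Illustration only: build <configuration><property>... XML from a
    # dict of job parameters, in the layout the tests assert against.
    doc = minidom.Document()
    configuration = doc.createElement('configuration')
    doc.appendChild(configuration)
    for name in sorted(configs):
        prop = doc.createElement('property')
        for tag, text in (('name', name), ('value', str(configs[name]))):
            element = doc.createElement(tag)
            element.appendChild(doc.createTextNode(text))
            prop.appendChild(element)
        configuration.appendChild(prop)
    return doc.toprettyxml(indent='  ')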
Example #9
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    hdfs_user = plugin.get_hdfs_user()
    wf_dir = create_workflow_dir(u.get_jobtracker(cluster), job, hdfs_user)
    upload_job_files(u.get_jobtracker(cluster), wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution.job_configs, input_source,
                                      output_source)

    path_to_workflow = upload_workflow_file(u.get_jobtracker(cluster), wf_dir,
                                            wf_xml, hdfs_user)

    jt_path = '%s:8021' % u.get_jobtracker(cluster).hostname
    nn_path = 'hdfs://%s:8020' % u.get_namenode(cluster).hostname

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/")
    job_parameters = {
        "jobTracker": jt_path,
        "nameNode": nn_path,
        "user.name": "hadoop",
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true"
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters))
    client.run_job(oozie_job_id)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {
            'oozie_job_id': oozie_job_id,
            'start_time': datetime.datetime.now()
        })

    return job_execution
Example #10
    def _build_workflow_common(self, job_type, streaming=False):
        if streaming:
            configs = {'edp.streaming.mapper': '/usr/bin/cat',
                       'edp.streaming.reducer': '/usr/bin/wc'}
            configs = {'configs': configs}
        else:
            configs = {}

        job, job_exec = _create_all_stack(job_type, configs)

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        if streaming:
            self.assertIn("""
      <streaming>
        <mapper>/usr/bin/cat</mapper>
        <reducer>/usr/bin/wc</reducer>
      </streaming>""", res)

        self.assertIn("""
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn("""
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>""", res)
Example #11
    def test_build_workflow_for_job_jar(self):

        job, origin = _create_all_stack("Jar")
        job_exec = _create_job_exec(job.id)

        input_data = _create_data_source("swift://ex.savanna/i")
        output_data = _create_data_source("swift://ex.savanna/o")

        creator = workflow_factory.get_creator("Jar", origin)

        res = creator.get_workflow_xml(job_exec.job_configs, input_data, output_data)

        self.assertIn(
            """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>""",
            res,
        )
Example #12
def run_job(ctx, job_execution):
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != "Active":
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    job_origin = conductor.job_origin_get(context.ctx(), job.job_origin_id)
    input_source = conductor.data_source_get(ctx, job_execution.input_id)
    output_source = conductor.data_source_get(ctx, job_execution.output_id)
    # TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    wf_dir = create_workflow_dir(u.get_jobtracker(cluster), job)
    upload_job_files(u.get_jobtracker(cluster), wf_dir, job_origin)

    creator = workflow_factory.get_creator(job.type, job_origin)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution.job_configs, input_source, output_source)

    path_to_workflow = upload_workflow_file(u.get_jobtracker(cluster), wf_dir, wf_xml)

    jt_path = "%s:8021" % u.get_jobtracker(cluster).hostname
    nn_path = "hdfs://%s:8020" % u.get_namenode(cluster).hostname

    client = o.OozieClient(cluster["info"]["JobFlow"]["Oozie"] + "/oozie/")
    job_parameters = {
        "jobTracker": jt_path,
        "nameNode": nn_path,
        "user.name": "hadoop",
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true",
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters))
    client.run_job(oozie_job_id)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {"oozie_job_id": oozie_job_id, "start_time": datetime.datetime.now()}
    )

    return job_execution
Example #13
    def test_build_workflow_for_job_jar(self):

        job, job_exec = _create_all_stack('Jar')

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('swift://ex.savanna/o')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec.job_configs, input_data,
                                       output_data)

        self.assertIn(
            """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""", res)

        self.assertIn(
            """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""", res)

        self.assertIn(
            """
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>""", res)

        self.assertIn(
            """
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>""", res)
Example #14
    def test_build_workflow_for_job_jar_with_conf(self):
        job, origin = _create_all_stack("Jar")

        input_data = _create_data_source("swift://ex.savanna/i")
        output_data = _create_data_source("swift://ex.savanna/o")

        job_exec = _create_job_exec(job.id, configs={"configs": {"c": "f"}})

        creator = workflow_factory.get_creator("Jar", origin)

        res = creator.get_workflow_xml(job_exec.job_configs, input_data, output_data)

        self.assertIn(
            """
        <property>
          <name>c</name>
          <value>f</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>mapred.input.dir</name>
          <value>swift://ex.savanna/i</value>
        </property>""",
            res,
        )

        self.assertIn(
            """
        <property>
          <name>mapred.output.dir</name>
          <value>swift://ex.savanna/o</value>
        </property>""",
            res,
        )
Example #15
    def test_build_workflow_swift_configs(self, job_binary):

        # Test that swift configs come from either input or output data sources
        job, job_exec = _create_all_stack('Pig')
        job_binary.return_value = {"name": "script.pig"}

        input_data = _create_data_source('swift://ex.savanna/i')
        output_data = _create_data_source('hdfs://user/hadoop/out')

        creator = workflow_factory.get_creator(job)
        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        input_data = _create_data_source('hdfs://user/hadoop/in')
        output_data = _create_data_source('swift://ex.savanna/o')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        self.assertIn("""
      <configuration>
        <property>
          <name>fs.swift.service.savanna.password</name>
          <value>admin1</value>
        </property>
        <property>
          <name>fs.swift.service.savanna.username</name>
          <value>admin</value>
        </property>
      </configuration>""", res)

        job, job_exec = _create_all_stack('Pig', configs={'configs':
                                                          {'dummy': 'value'}})
        input_data = _create_data_source('hdfs://user/hadoop/in')
        output_data = _create_data_source('hdfs://user/hadoop/out')

        creator = workflow_factory.get_creator(job)

        res = creator.get_workflow_xml(job_exec,
                                       input_data, output_data)

        self.assertIn("""
      <configuration>
        <property>
          <name>dummy</name>
          <value>value</value>
        </property>
      </configuration>""", res)
Example #16
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, 'Java'):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution, input_source,
                                      output_source)

    path_to_workflow = upload_workflow_file(oozie_server, wf_dir, wf_xml,
                                            hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {
        "jobTracker": rm_path,
        "nameNode": nn_path,
        "user.name": hdfs_user,
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true"
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {
            'oozie_job_id': oozie_job_id,
            'start_time': datetime.datetime.now()
        })
    client.run_job(job_execution, oozie_job_id)

    return job_execution
Example #17
    def test_jar_creator_is_mapreduce(self):
        # Ensure that we get the MapReduce workflow factory for 'Jar' jobs
        job, _ = _create_all_stack('Jar')

        creator = workflow_factory.get_creator(job)
        self.assertEqual(type(creator), workflow_factory.MapReduceFactory)
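
The assertion above only pins down the 'Jar' entry of the dispatch. A sketch of the kind of type-to-factory lookup get_creator performs; only MapReduceFactory is a name confirmed by the test, and the other classes here are placeholders rather than the module's real factories:

# Sketch only: a job-type dispatch consistent with the test above.
class MapReduceFactory(object):
    """Placeholder for the real MapReduce workflow factory."""


class PigFactory(object):
    """Placeholder Pig workflow factory."""


class HiveFactory(object):
    """Placeholder Hive workflow factory."""


_TYPE_MAP = {
    'Jar': MapReduceFactory,  # 'Jar' jobs use the MapReduce factory
    'MapReduce': MapReduceFactory,
    'Pig': PigFactory,
    'Hive': HiveFactory,
}


def get_creator(job):
    return _TYPE_MAP[job.type]()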