Example 1
def _create_job_binary(id, type):
    binary = mock.Mock()
    binary.id = id
    binary.url = "savanna-db://42"
    if edp.compare_job_type(type, 'Pig'):
        binary.name = "script.pig"
    elif edp.compare_job_type(type, 'MapReduce', 'Java'):
        binary.name = "main.jar"
    else:
        binary.name = "script.q"
    return binary
Example 2
    def test_compare_job_type(self):
        self.assertTrue(
            edp.compare_job_type("Java", "Java", "MapReduce", strict=True))
        self.assertFalse(
            edp.compare_job_type(MAPRED_STREAMING,
                                 "Java",
                                 "MapReduce",
                                 strict=True))
        self.assertTrue(
            edp.compare_job_type(MAPRED_STREAMING, "Java", "MapReduce"))
        self.assertFalse(
            edp.compare_job_type("MapReduce", "Java", MAPRED_STREAMING))
Example 3
def _create_job_exec(job_id, type, configs=None):
    j_exec = mock.Mock()
    j_exec.job_id = job_id
    j_exec.job_configs = configs
    if edp.compare_job_type(type, "Java"):
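        # Java jobs carry their entry point and JVM options in job_configs,
        # so callers must pass a configs dict with a 'configs' key for them;
        # the default of None would fail on the item assignments below.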
        j_exec.job_configs['configs']['edp.java.main_class'] = _java_main_class
        j_exec.job_configs['configs']['edp.java.java_opts'] = _java_opts
    return j_exec
Example 4
def _create_job(id, job_binary, type):
    job = mock.Mock()
    job.id = id
    job.type = type
    job.name = 'special_name'
    if edp.compare_job_type(type, 'Pig', 'Hive'):
        job.mains = [job_binary]
        job.libs = None
    else:
        job.libs = [job_binary]
        job.mains = None
    return job
Example 5
def get_possible_job_config(job_type):
    if not edp.compare_job_type(job_type, *get_possible_job_types()):
        return None

    if edp.compare_job_type(job_type, 'Java'):
        return {'job_config': {'configs': [], 'args': []}}

    if edp.compare_job_type(job_type, 'MapReduce', 'Pig'):
        #TODO(nmakhotkin) Savanna should return config based on specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/mapred-default.xml')
        if edp.compare_job_type(job_type, 'MapReduce'):
            cfg += xmlutils.load_hadoop_xml_defaults(
                'service/edp/resources/mapred-job-config.xml')
    elif edp.compare_job_type(job_type, 'Hive'):
        #TODO(nmakhotkin) Savanna should return config based on specific plugin
        cfg = xmlutils.load_hadoop_xml_defaults(
            'plugins/vanilla/v1_2_1/resources/hive-default.xml')

    # TODO(tmckay): args should be a list when bug #269968
    # is fixed on the UI side
    config = {'configs': cfg, "args": {}}
    # Only Pig and Hive jobs reach this point with params; compare against
    # the job type being queried, not a literal type name.
    if not edp.compare_job_type(job_type, 'MapReduce', 'Java'):
        config.update({'params': {}})
    return {'job_config': config}
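A consequence of the early return in the function above is that a Java job gets only the empty configs/args skeleton, while the other supported types pull their defaults from the bundled XML resources. For example (the second check assumes 'Foo' is not among get_possible_job_types()):

assert get_possible_job_config('Java') == {
    'job_config': {'configs': [], 'args': []}}
assert get_possible_job_config('Foo') is None  # unsupported type, by assumption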
Example 6
    def edp_testing(self,
                    job_type,
                    job_data_list,
                    lib_data_list=None,
                    configs=None,
                    pass_input_output_args=False):
        try:
            swift = self.connect_to_swift()
            container_name = 'Edp-test-%s' % str(uuid.uuid4())[:8]
            swift.put_container(container_name)
            swift.put_object(
                container_name, 'input', ''.join(
                    random.choice(':' + ' ' + '\n' + string.ascii_lowercase)
                    for x in range(10000)))

        except Exception as e:
            with excutils.save_and_reraise_exception():
                self.delete_swift_container(swift, container_name)
                print(str(e))
        input_id = None
        output_id = None
        job_id = None
        job_execution = None
        try:
            job_binary_list = []
            lib_binary_list = []
            job_binary_internal_list = []

            swift_input_url = 'swift://%s.savanna/input' % container_name
            swift_output_url = 'swift://%s.savanna/output' % container_name

            # Java jobs don't use data sources.  Input/output paths must
            # be passed as args with corresponding username/password configs
            if not edp.compare_job_type(job_type, "Java"):
                input_id = self._create_data_source(
                    'input-%s' % str(uuid.uuid4())[:8], 'swift',
                    swift_input_url)
                output_id = self._create_data_source(
                    'output-%s' % str(uuid.uuid4())[:8], 'swift',
                    swift_output_url)

            if job_data_list:
                self._create_job_binaries(job_data_list,
                                          job_binary_internal_list,
                                          job_binary_list)
            if lib_data_list:
                self._create_job_binaries(lib_data_list,
                                          job_binary_internal_list,
                                          lib_binary_list)
            job_id = self._create_job(
                'Edp-test-job-%s' % str(uuid.uuid4())[:8], job_type,
                job_binary_list, lib_binary_list)
            if not configs:
                configs = {}

            # Append the input/output paths with the swift configs
            # if the caller has requested it...
            if edp.compare_job_type(job_type,
                                    "Java") and pass_input_output_args:
                self._add_swift_configs(configs)
                if "args" in configs:
                    configs["args"].extend([swift_input_url, swift_output_url])
                else:
                    configs["args"] = [swift_input_url, swift_output_url]

            job_execution = self.savanna.job_executions.create(job_id,
                                                               self.cluster_id,
                                                               input_id,
                                                               output_id,
                                                               configs=configs)

            if job_execution:
                self._await_job_execution(job_execution)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))

        finally:
            self.delete_swift_container(swift, container_name)
            self._delete_job(job_execution, job_id,
                             job_binary_list + lib_binary_list,
                             job_binary_internal_list, input_id, output_id)
Example 7
def validate(input_data, output_data, job):
    if not edp.compare_job_type(job.type, 'Pig', 'MapReduce', 'Hive', 'Java'):
        raise RuntimeError
Example 8
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, 'Java'):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    # Do other job type specific setup here, for example
    # uploading hive configuration
    creator.configure_workflow_if_needed(cluster, wf_dir)

    wf_xml = creator.get_workflow_xml(job_execution, input_source,
                                      output_source)

    path_to_workflow = upload_workflow_file(oozie_server, wf_dir, wf_xml,
                                            hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {
        "jobTracker": rm_path,
        "nameNode": nn_path,
        "user.name": hdfs_user,
        "oozie.wf.application.path": "%s%s" % (nn_path, path_to_workflow),
        "oozie.use.system.libpath": "true"
    }

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {
            'oozie_job_id': oozie_job_id,
            'start_time': datetime.datetime.now()
        })
    client.run_job(job_execution, oozie_job_id)

    return job_execution