Example #1
def generate_xml_configs(configs, storage_path, nn_hostname, hadoop_port):
    if hadoop_port is None:
        hadoop_port = 8020

    cfg = {
        "fs.defaultFS": "hdfs://%s:%s" % (nn_hostname, str(hadoop_port)),
        "dfs.namenode.name.dir": extract_hadoop_path(storage_path, "/dfs/nn"),
        "dfs.datanode.data.dir": extract_hadoop_path(storage_path, "/dfs/dn"),
        "hadoop.tmp.dir": extract_hadoop_path(storage_path, "/dfs"),
    }

    # inserting user-defined configs
    for key, value in extract_hadoop_xml_confs(configs):
        cfg[key] = value

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT

    if CONF.enable_data_locality:
        cfg.update(topology.TOPOLOGY_CONFIG)
        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()

    xml_configs = {"core-site": x.create_hadoop_xml(cfg, core_all), "hdfs-site": x.create_hadoop_xml(cfg, HDFS_DEFAULT)}

    return xml_configs
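
The dict returned above maps each site-file name (without the .xml suffix) to its rendered XML content. As a hedged follow-up sketch using only the Python standard library, a caller could persist such a dict locally as shown below; in Sahara itself these files are pushed to the cluster instances rather than written this way, and the /etc/hadoop path plus the write_site_files name are purely illustrative.

import os


def write_site_files(xml_configs, conf_dir='/etc/hadoop'):
    # xml_configs: e.g. {'core-site': '<xml ...>', 'hdfs-site': '<xml ...>'}
    for name, content in xml_configs.items():
        path = os.path.join(conf_dir, name + '.xml')
        with open(path, 'w') as f:
            f.write(content)
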
Example #2
def generate_xml_configs(configs, storage_path, nn_hostname, hadoop_port):
    if hadoop_port is None:
        hadoop_port = 8020

    cfg = {
        'fs.defaultFS': 'hdfs://%s:%s' % (nn_hostname, str(hadoop_port)),
        'dfs.namenode.name.dir': extract_hadoop_path(storage_path, '/dfs/nn'),
        'dfs.datanode.data.dir': extract_hadoop_path(storage_path, '/dfs/dn'),
        'hadoop.tmp.dir': extract_hadoop_path(storage_path, '/dfs'),
    }

    # inserting user-defined configs
    for key, value in extract_hadoop_xml_confs(configs):
        cfg[key] = value

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT

    if CONF.enable_data_locality:
        cfg.update(topology.TOPOLOGY_CONFIG)
        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(cfg, core_all),
        'hdfs-site': x.create_hadoop_xml(cfg, HDFS_DEFAULT)
    }

    return xml_configs
Example #3
def generate_xml_configs(cluster, node_group, hive_mysql_passwd):
    oozie_hostname = vu.get_instance_hostname(vu.get_oozie(cluster))
    hive_hostname = vu.get_instance_hostname(vu.get_hiveserver(cluster))

    ng_configs = node_group.configuration()

    general_cfg = get_general_configs(hive_hostname, hive_mysql_passwd)

    all_cfg = generate_sahara_configs(cluster, node_group)

    # inserting user-defined configs
    for key, value in extract_xml_confs(ng_configs):
        all_cfg[key] = value

    # applying swift configs if user enabled it
    swift_xml_confs = swift.get_swift_configs()
    all_cfg = generate_cfg_from_general(all_cfg, ng_configs, general_cfg)

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT + swift_xml_confs
    mapred_all = MAPRED_DEFAULT

    if CONF.enable_data_locality:
        all_cfg.update(topology.TOPOLOGY_CONFIG)

        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()
        mapred_all += topology.vm_awareness_mapred_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(all_cfg, core_all),
        'mapred-site': x.create_hadoop_xml(all_cfg, mapred_all),
        'hdfs-site': x.create_hadoop_xml(all_cfg, HDFS_DEFAULT)
    }

    if hive_hostname:
        cfg = all_cfg
        cfg_filter = HIVE_DEFAULT
        proxy_configs = cluster.cluster_configs.get('proxy_configs')
        if CONF.use_identity_api_v3 and proxy_configs:
            cfg, cfg_filter = _inject_swift_trust_info(cfg,
                                                       cfg_filter,
                                                       proxy_configs)
        xml_configs.update({'hive-site':
                            x.create_hadoop_xml(cfg, cfg_filter)})
        LOG.debug('Generated hive-site.xml for hive {host}'.format(
            host=hive_hostname))

    if oozie_hostname:
        xml_configs.update({'oozie-site':
                            x.create_hadoop_xml(all_cfg, o_h.OOZIE_DEFAULT)})
        LOG.debug('Generated oozie-site.xml for oozie {host}'.format(
            host=oozie_hostname))

    return xml_configs
Example #4
def generate_xml_configs(cluster, node_group, hive_mysql_passwd):
    oozie_hostname = vu.get_instance_hostname(vu.get_oozie(cluster))
    hive_hostname = vu.get_instance_hostname(vu.get_hiveserver(cluster))

    ng_configs = node_group.configuration()

    general_cfg = get_general_configs(hive_hostname, hive_mysql_passwd)

    all_cfg = generate_sahara_configs(cluster, node_group)

    # inserting user-defined configs
    for key, value in extract_xml_confs(ng_configs):
        all_cfg[key] = value

    # applying swift configs if user enabled it
    swift_xml_confs = swift.get_swift_configs()
    all_cfg = generate_cfg_from_general(all_cfg, ng_configs, general_cfg)

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT + swift_xml_confs
    mapred_all = MAPRED_DEFAULT

    if CONF.enable_data_locality:
        all_cfg.update(topology.TOPOLOGY_CONFIG)

        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()
        mapred_all += topology.vm_awareness_mapred_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(all_cfg, core_all),
        'mapred-site': x.create_hadoop_xml(all_cfg, mapred_all),
        'hdfs-site': x.create_hadoop_xml(all_cfg, HDFS_DEFAULT)
    }

    if hive_hostname:
        cfg = all_cfg
        cfg_filter = HIVE_DEFAULT
        proxy_configs = cluster.cluster_configs.get('proxy_configs')
        if CONF.use_identity_api_v3 and proxy_configs:
            cfg, cfg_filter = _inject_swift_trust_info(cfg, cfg_filter,
                                                       proxy_configs)
        xml_configs.update({'hive-site': x.create_hadoop_xml(cfg, cfg_filter)})
        LOG.debug('Generated hive-site.xml for hive {host}'.format(
            host=hive_hostname))

    if oozie_hostname:
        xml_configs.update(
            {'oozie-site': x.create_hadoop_xml(all_cfg, o_h.OOZIE_DEFAULT)})
        LOG.debug('Generated oozie-site.xml for oozie {host}'.format(
            host=oozie_hostname))

    return xml_configs
Example #5
    def run_job(self, job_execution):
        prepared_job_params = self._prepare_run_job(job_execution)
        path_to_workflow = prepared_job_params['path_to_workflow']
        hdfs_user = prepared_job_params['hdfs_user']
        oozie_params = prepared_job_params['oozie_params']
        use_hbase_lib = prepared_job_params['use_hbase_lib']
        ctx = prepared_job_params['context']
        job_execution = prepared_job_params['job_execution']

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self.get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_info(job_execution, oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #6
    def _upload_wrapper_xml(self, where, job_dir, job_configs):
        xml_name = 'spark.xml'
        proxy_configs = job_configs.get('proxy_configs')
        configs = {}
        cfgs = job_configs.get('configs', {})
        if proxy_configs:
            configs[sw.HADOOP_SWIFT_USERNAME] = proxy_configs.get(
                'proxy_username')
            configs[sw.HADOOP_SWIFT_PASSWORD] = key_manager.get_secret(
                proxy_configs.get('proxy_password'))
            configs[sw.HADOOP_SWIFT_TRUST_ID] = proxy_configs.get(
                'proxy_trust_id')
            configs[sw.HADOOP_SWIFT_DOMAIN_NAME] = CONF.proxy_user_domain_name
        else:
            targets = [sw.HADOOP_SWIFT_USERNAME]
            configs = {k: cfgs[k] for k in targets if k in cfgs}
            if sw.HADOOP_SWIFT_PASSWORD in cfgs:
                configs[sw.HADOOP_SWIFT_PASSWORD] = (
                    key_manager.get_secret(cfgs[sw.HADOOP_SWIFT_PASSWORD])
                )

        for s3_cfg_key in s3_common.S3_DS_CONFIGS:
            if s3_cfg_key in cfgs:
                if s3_cfg_key == s3_common.S3_SECRET_KEY_CONFIG:
                    configs[s3_cfg_key] = (
                        key_manager.get_secret(cfgs[s3_cfg_key])
                    )
                else:
                    configs[s3_cfg_key] = cfgs[s3_cfg_key]

        content = xmlutils.create_hadoop_xml(configs)
        with remote.get_remote(where) as r:
            dst = os.path.join(job_dir, xml_name)
            r.write_file_to(dst, content)
        return xml_name
Example #7
    def run_job(self, job_execution):
        prepared_job_params = self._prepare_run_job(job_execution)
        path_to_workflow = prepared_job_params['path_to_workflow']
        hdfs_user = prepared_job_params['hdfs_user']
        oozie_params = prepared_job_params['oozie_params']
        use_hbase_lib = prepared_job_params['use_hbase_lib']
        ctx = prepared_job_params['context']
        job_execution = prepared_job_params['job_execution']

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_info(job_execution, oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #8
def generate_xml_configs(cluster, node_group, hive_mysql_passwd):
    oozie_hostname = _get_hostname(utils.get_oozie(cluster))
    hive_hostname = _get_hostname(utils.get_hiveserver(cluster))

    ng_configs = node_group.configuration()

    general_cfg = get_general_configs(hive_hostname, hive_mysql_passwd)

    all_cfg = generate_sahara_configs(cluster, node_group)

    # inserting user-defined configs
    for key, value in extract_xml_confs(ng_configs):
        all_cfg[key] = value

    # applying swift configs if user enabled it
    swift_xml_confs = swift.get_swift_configs()
    all_cfg = generate_cfg_from_general(all_cfg, ng_configs, general_cfg)

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT + swift_xml_confs
    mapred_all = MAPRED_DEFAULT

    if CONF.enable_data_locality:
        all_cfg.update(topology.TOPOLOGY_CONFIG)

        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()
        mapred_all += topology.vm_awareness_mapred_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(all_cfg, core_all),
        'mapred-site': x.create_hadoop_xml(all_cfg, mapred_all),
        'hdfs-site': x.create_hadoop_xml(all_cfg, HDFS_DEFAULT)
    }

    if hive_hostname:
        xml_configs.update(
            {'hive-site': x.create_hadoop_xml(all_cfg, HIVE_DEFAULT)})
        LOG.debug('Generated hive-site.xml for hive %s', hive_hostname)

    if oozie_hostname:
        xml_configs.update(
            {'oozie-site': x.create_hadoop_xml(all_cfg, o_h.OOZIE_DEFAULT)})
        LOG.debug('Generated oozie-site.xml for oozie %s', oozie_hostname)

    return xml_configs
Example #9
def generate_xml_configs(cluster, node_group, hive_mysql_passwd):
    oozie_hostname = _get_hostname(utils.get_oozie(cluster))
    hive_hostname = _get_hostname(utils.get_hiveserver(cluster))

    ng_configs = node_group.configuration()

    general_cfg = get_general_configs(hive_hostname, hive_mysql_passwd)

    all_cfg = generate_sahara_configs(cluster, node_group)

    # inserting user-defined configs
    for key, value in extract_xml_confs(ng_configs):
        all_cfg[key] = value

    # applying swift configs if user enabled it
    swift_xml_confs = swift.get_swift_configs()
    all_cfg = generate_cfg_from_general(all_cfg, ng_configs, general_cfg)

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT + swift_xml_confs
    mapred_all = MAPRED_DEFAULT

    if CONF.enable_data_locality:
        all_cfg.update(topology.TOPOLOGY_CONFIG)

        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()
        mapred_all += topology.vm_awareness_mapred_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(all_cfg, core_all),
        'mapred-site': x.create_hadoop_xml(all_cfg, mapred_all),
        'hdfs-site': x.create_hadoop_xml(all_cfg, HDFS_DEFAULT)
    }

    if hive_hostname:
        xml_configs.update({'hive-site':
                            x.create_hadoop_xml(all_cfg, HIVE_DEFAULT)})
        LOG.debug('Generated hive-site.xml for hive %s', hive_hostname)

    if oozie_hostname:
        xml_configs.update({'oozie-site':
                            x.create_hadoop_xml(all_cfg, o_h.OOZIE_DEFAULT)})
        LOG.debug('Generated oozie-site.xml for oozie %s', oozie_hostname)

    return xml_configs
Example #10
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs))

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(job, self.cluster,
                                                   updated_job_configs,
                                                   input_source, output_source,
                                                   hdfs_user)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #11
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, 'Java'):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)

    return job_execution
Example #12
def run_job(job_execution):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return job_execution

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None
    #TODO(nprivalova): should be removed after all features implemented
    validate(input_source, output_source, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    rm_path = _get_resource_manager_path(cluster)
    nn_path = cluster['info']['HDFS']['NameNode']

    client = o.OozieClient(cluster['info']['JobFlow']['Oozie'] + "/oozie/",
                           _get_oozie_server(cluster))
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)

    return job_execution
Example #13
def generate_xml_configs(configs, storage_path, nn_hostname, hadoop_port):
    if hadoop_port is None:
        hadoop_port = 8020

    cfg = {
        'fs.defaultFS': 'hdfs://%s:%s' % (nn_hostname, str(hadoop_port)),
        'dfs.namenode.name.dir': extract_hadoop_path(storage_path,
                                                     '/dfs/nn'),
        'dfs.datanode.data.dir': extract_hadoop_path(storage_path,
                                                     '/dfs/dn'),
        'hadoop.tmp.dir': extract_hadoop_path(storage_path,
                                              '/dfs'),
        'dfs.hosts': '/etc/hadoop/dn.incl',
        'dfs.hosts.exclude': '/etc/hadoop/dn.excl'
    }

    # inserting user-defined configs
    for key, value in extract_hadoop_xml_confs(configs):
        cfg[key] = value

    # Add the swift defaults if they have not been set by the user
    swft_def = []
    if is_swift_enabled(configs):
        swft_def = SWIFT_DEFAULTS
        swift_configs = extract_name_values(swift.get_swift_configs())
        for key, value in six.iteritems(swift_configs):
            if key not in cfg:
                cfg[key] = value

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT + swft_def

    if CONF.enable_data_locality:
        cfg.update(topology.TOPOLOGY_CONFIG)
        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(cfg, core_all),
        'hdfs-site': x.create_hadoop_xml(cfg, HDFS_DEFAULT)
    }

    return xml_configs
Example #14
def generate_xml_configs(configs, storage_path, nn_hostname, hadoop_port):
    if hadoop_port is None:
        hadoop_port = 8020

    cfg = {
        'fs.defaultFS': 'hdfs://%s:%s' % (nn_hostname, str(hadoop_port)),
        'dfs.namenode.name.dir': extract_hadoop_path(storage_path,
                                                     '/dfs/nn'),
        'dfs.datanode.data.dir': extract_hadoop_path(storage_path,
                                                     '/dfs/dn'),
        'hadoop.tmp.dir': extract_hadoop_path(storage_path,
                                              '/dfs'),
        'dfs.hosts': '/etc/hadoop/dn.incl',
        'dfs.hosts.exclude': '/etc/hadoop/dn.excl'
    }

    # inserting user-defined configs
    for key, value in extract_hadoop_xml_confs(configs):
        cfg[key] = value

    # Add the swift defaults if they have not been set by the user
    swft_def = []
    if is_swift_enabled(configs):
        swft_def = SWIFT_DEFAULTS
        swift_configs = extract_name_values(swift.get_swift_configs())
        for key, value in six.iteritems(swift_configs):
            if key not in cfg:
                cfg[key] = value

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT + swft_def

    if CONF.enable_data_locality:
        cfg.update(topology.TOPOLOGY_CONFIG)
        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(cfg, core_all),
        'hdfs-site': x.create_hadoop_xml(cfg, HDFS_DEFAULT)
    }

    return xml_configs
Example #15
def generate_xml_configs(configs, storage_path, nn_hostname, hadoop_port):
    """dfs.name.dir': extract_hadoop_path(storage_path,
                                            '/lib/hadoop/hdfs/namenode'),
    'dfs.data.dir': extract_hadoop_path(storage_path,
                                            '/lib/hadoop/hdfs/datanode'),
    'dfs.name.dir': storage_path + 'hdfs/name',
    'dfs.data.dir': storage_path + 'hdfs/data',

    'dfs.hosts': '/etc/hadoop/dn.incl',
    'dfs.hosts.exclude': '/etc/hadoop/dn.excl',
    """
    if hadoop_port is None:
        hadoop_port = 8020

    cfg = {
        'fs.defaultFS': 'hdfs://%s:%s' % (nn_hostname, str(hadoop_port)),
        'dfs.namenode.name.dir': extract_hadoop_path(storage_path,
                                                     '/dfs/nn'),
        'dfs.datanode.data.dir': extract_hadoop_path(storage_path,
                                                     '/dfs/dn'),
        'hadoop.tmp.dir': extract_hadoop_path(storage_path,
                                              '/dfs'),
    }

    # inserting user-defined configs
    for key, value in extract_hadoop_xml_confs(configs):
        cfg[key] = value

    # invoking applied configs to appropriate xml files
    core_all = CORE_DEFAULT

    if CONF.enable_data_locality:
        cfg.update(topology.TOPOLOGY_CONFIG)
        # applying vm awareness configs
        core_all += topology.vm_awareness_core_config()

    xml_configs = {
        'core-site': x.create_hadoop_xml(cfg, core_all),
        'hdfs-site': x.create_hadoop_xml(cfg, HDFS_DEFAULT)
    }

    return xml_configs
Example #16
def _configure_swift_to_inst(instance):
    cluster = instance.node_group.cluster
    with instance.remote() as r:
        r.execute_command('sudo curl %s -o %s/hadoop-openstack.jar' % (
            c_helper.get_swift_lib_url(cluster), HADOOP_LIB_DIR))
        core_site = r.read_file_from(PATH_TO_CORE_SITE_XML)
        configs = xmlutils.parse_hadoop_xml_with_name_and_value(core_site)
        configs.extend(swift_helper.get_swift_configs())
        confs = dict((c['name'], c['value']) for c in configs)
        new_core_site = xmlutils.create_hadoop_xml(confs)
        r.write_file_to(PATH_TO_CORE_SITE_XML, new_core_site, run_as_root=True)
Example #17
def _configure_swift_to_inst(instance):
    cluster = instance.node_group.cluster
    with instance.remote() as r:
        r.execute_command(
            'sudo curl %s -o %s/hadoop-openstack.jar' %
            (c_helper.get_swift_lib_url(cluster), HADOOP_LIB_DIR))
        core_site = r.read_file_from(PATH_TO_CORE_SITE_XML)
        configs = xmlutils.parse_hadoop_xml_with_name_and_value(core_site)
        configs.extend(swift_helper.get_swift_configs())
        confs = dict((c['name'], c['value']) for c in configs)
        new_core_site = xmlutils.create_hadoop_xml(confs)
        r.write_file_to(PATH_TO_CORE_SITE_XML, new_core_site, run_as_root=True)
Example #18
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution,
                                                                 job)
        proxy_configs = job_execution.job_configs.get('proxy_configs')

        for data_source in [input_source, output_source]:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, job_execution, input_source, output_source,
            hdfs_user)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #19
    def test_create_hadoop_xml(self):
        conf = x.load_hadoop_xml_defaults(
            'tests/unit/resources/test-default.xml')
        self.assertEqual("""<?xml version="1.0" ?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>name1</name>
    <value>some_val1</value>
  </property>
  <property>
    <name>name2</name>
    <value>2</value>
  </property>
</configuration>
""",
                         x.create_hadoop_xml({'name1': 'some_val1',
                                              'name2': 2}, conf),)
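
To make the asserted output easier to follow, here is a minimal re-implementation sketch of a function with the same dict-in, configuration-XML-out behaviour. This is not Sahara's actual xmlutils code: the name create_hadoop_xml_sketch is invented, and treating config_filter as a list of {'name': ..., 'value': ...} dicts is an assumption based on how load_hadoop_xml_defaults is used in the test above.

from xml.dom import minidom


def create_hadoop_xml_sketch(configs, config_filter=None):
    # Build <?xml-stylesheet ...?><configuration><property>... as in the test.
    doc = minidom.Document()
    doc.appendChild(doc.createProcessingInstruction(
        'xml-stylesheet', 'type="text/xsl" href="configuration.xsl"'))
    root = doc.createElement('configuration')
    doc.appendChild(root)

    # Assumed filter format: a list of {'name': ..., 'value': ...} dicts.
    allowed = None
    if config_filter is not None:
        allowed = set(item['name'] for item in config_filter)

    for name in sorted(configs):
        if allowed is not None and name not in allowed:
            continue
        prop = doc.createElement('property')
        for tag, text in (('name', name), ('value', configs[name])):
            element = doc.createElement(tag)
            element.appendChild(doc.createTextNode(str(text)))
            prop.appendChild(element)
        root.appendChild(prop)

    return doc.toprettyxml(indent='  ')

With the inputs from the test, create_hadoop_xml_sketch({'name1': 'some_val1', 'name2': 2}) produces XML equivalent to the expected string above, up to property ordering and minor whitespace.
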
Example #20
    def test_create_hadoop_xml(self):
        conf = x.load_hadoop_xml_defaults(
            'tests/unit/resources/test-default.xml')
        self.assertEqual(x.create_hadoop_xml({'name1': 'some_val1',
                                              'name2': 2}, conf),
                         """<?xml version="1.0" ?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>name2</name>
    <value>2</value>
  </property>
  <property>
    <name>name1</name>
    <value>some_val1</value>
  </property>
</configuration>
""")
Example #21
    def run_job(self, job_execution):
        ctx = context.ctx()

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution,
                                                                 job)

        for data_source in [input_source, output_source]:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        hdfs_user = self.plugin.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the plugin api now.
        # However, other engines may need it.
        oozie_server = self.plugin.get_oozie_server(self.cluster)

        wf_dir = job_utils.create_hdfs_workflow_dir(oozie_server,
                                                    job, hdfs_user)
        job_utils.upload_job_files_to_hdfs(oozie_server, wf_dir,
                                           job, hdfs_user)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, job_execution, input_source, output_source)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #22
    def _upload_wrapper_xml(self, where, job_dir, job_configs):
        xml_name = 'spark.xml'
        proxy_configs = job_configs.get('proxy_configs')
        configs = {}
        if proxy_configs:
            configs[sw.HADOOP_SWIFT_USERNAME] = proxy_configs.get(
                'proxy_username')
            configs[sw.HADOOP_SWIFT_PASSWORD] = proxy_configs.get(
                'proxy_password')
            configs[sw.HADOOP_SWIFT_TRUST_ID] = proxy_configs.get(
                'proxy_trust_id')
            configs[sw.HADOOP_SWIFT_DOMAIN_NAME] = CONF.proxy_user_domain_name
        else:
            cfgs = job_configs.get('configs', {})
            targets = [sw.HADOOP_SWIFT_USERNAME, sw.HADOOP_SWIFT_PASSWORD]
            configs = {k: cfgs[k] for k in targets if k in cfgs}

        content = xmlutils.create_hadoop_xml(configs)
        with remote.get_remote(where) as r:
            dst = os.path.join(job_dir, xml_name)
            r.write_file_to(dst, content)
        return xml_name
Example #23
    def _upload_wrapper_xml(self, where, job_dir, job_configs):
        xml_name = 'spark.xml'
        proxy_configs = job_configs.get('proxy_configs')
        configs = {}
        if proxy_configs:
            configs[sw.HADOOP_SWIFT_USERNAME] = proxy_configs.get(
                'proxy_username')
            configs[sw.HADOOP_SWIFT_PASSWORD] = proxy_configs.get(
                'proxy_password')
            configs[sw.HADOOP_SWIFT_TRUST_ID] = proxy_configs.get(
                'proxy_trust_id')
            configs[sw.HADOOP_SWIFT_DOMAIN_NAME] = CONF.proxy_user_domain_name
        else:
            cfgs = job_configs.get('configs', {})
            targets = [sw.HADOOP_SWIFT_USERNAME, sw.HADOOP_SWIFT_PASSWORD]
            configs = {k: cfgs[k] for k in targets if k in cfgs}

        content = xmlutils.create_hadoop_xml(configs)
        with remote.get_remote(where) as r:
            dst = os.path.join(job_dir, xml_name)
            r.write_file_to(dst, content)
        return xml_name
Example #24
def _run_job(job_execution_id):
    ctx = context.ctx()

    job_execution = conductor.job_execution_get(ctx, job_execution_id)

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return

    job_execution = _update_job_execution_extra(job_execution, cluster)

    job = conductor.job_get(ctx, job_execution.job_id)
    input_source, output_source = _get_data_sources(job_execution, job)

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    plugin = _get_plugin(cluster)
    hdfs_user = plugin.get_hdfs_user()
    oozie_server = plugin.get_oozie_server(cluster)

    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    wf_xml = workflow_factory.get_workflow_xml(
        job, cluster, job_execution, input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    client = _create_oozie_client(cluster)
    job_params = _get_oozie_job_params(cluster, hdfs_user, path_to_workflow)
    oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                  job_execution)
    job_execution = conductor.job_execution_update(
        ctx, job_execution, {'oozie_job_id': oozie_job_id,
                             'start_time': datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)
Example #25
    def run_scheduled_job(self, job_execution):
        prepared_job_params = self._prepare_run_job(job_execution)
        oozie_server = prepared_job_params['oozie_server']
        wf_dir = prepared_job_params['wf_dir']
        hdfs_user = prepared_job_params['hdfs_user']
        oozie_params = prepared_job_params['oozie_params']
        use_hbase_lib = prepared_job_params['use_hbase_lib']
        ctx = prepared_job_params['context']
        job_execution = prepared_job_params['job_execution']

        coord_configs = {
            "jobTracker": "${jobTracker}",
            "nameNode": "${nameNode}"
        }

        coord_xml = self._create_coordinator_xml(coord_configs)

        self._upload_coordinator_file(oozie_server, wf_dir, coord_xml,
                                      hdfs_user)
        job_params = self._get_oozie_job_params(
            hdfs_user, None, oozie_params, use_hbase_lib,
            job_execution.job_configs.job_execution_info, wf_dir, "scheduled")

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)

        job_execution = conductor.job_execution_get(ctx, job_execution.id)

        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #26
    def run_scheduled_job(self, job_execution):
        prepared_job_params = self._prepare_run_job(job_execution)
        oozie_server = prepared_job_params['oozie_server']
        wf_dir = prepared_job_params['wf_dir']
        hdfs_user = prepared_job_params['hdfs_user']
        oozie_params = prepared_job_params['oozie_params']
        use_hbase_lib = prepared_job_params['use_hbase_lib']
        ctx = prepared_job_params['context']
        job_execution = prepared_job_params['job_execution']

        coord_configs = {"jobTracker": "${jobTracker}",
                         "nameNode": "${nameNode}"}

        coord_xml = self._create_coordinator_xml(coord_configs)

        self._upload_coordinator_file(oozie_server, wf_dir, coord_xml,
                                      hdfs_user)
        job_params = self._get_oozie_job_params(
            hdfs_user, None, oozie_params, use_hbase_lib,
            job_execution.job_configs.job_execution_info, wf_dir,
            "scheduled")

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)

        job_execution = conductor.job_execution_get(ctx, job_execution.id)

        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #27
def _generate_xml(configs):
    xml_confs = {}
    for service, confs in six.iteritems(configs):
        xml_confs[service] = x.create_hadoop_xml(confs)

    return xml_confs
Example #28
def _generate_xml(configs):
    xml_confs = {}
    for service, confs in six.iteritems(configs):
        xml_confs[service] = x.create_hadoop_xml(confs)

    return xml_confs
Example #29
    def run_job(self, job_execution):
        ctx = context.ctx()

        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(job_execution, job, data_source_urls)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = job_utils.resolve_data_source_references(
            job_execution.job_configs, job_execution.id, data_source_urls
        )

        job_execution = conductor.job_execution_update(ctx, job_execution, {"data_source_urls": data_source_urls})

        proxy_configs = updated_job_configs.get("proxy_configs")
        configs = updated_job_configs.get("configs", {})
        use_hbase_lib = configs.get("edp.hbase_common_lib", {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith("oozie."):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == "hdfs":
                h.configure_cluster_for_hdfs(self.cluster, data_source_urls[data_source.id])
                break

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs, proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs, input_source, output_source, hdfs_user, data_source_urls
        )

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir, wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user, path_to_workflow, oozie_params, use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params), job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info["status"] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)
        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution, oozie_job_id)["status"]
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #30
def to_xml_file_content(data):
    return x.create_hadoop_xml(data)
Example #31
def run_job(job_execution_id):
    ctx = context.ctx()

    job_execution = conductor.job_execution_get(ctx,
                                                job_execution_id)

    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != 'Active':
        return

    if CONF.use_namespaces and not CONF.use_floating_ips:
        plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
        oozie = plugin.get_oozie_server(cluster)

        info = oozie.remote().get_neutron_info()
        extra = job_execution.extra.copy()
        extra['neutron'] = info

        job_execution = conductor.job_execution_update(ctx,
                                                       job_execution_id,
                                                       {'extra': extra})

    job = conductor.job_get(ctx, job_execution.job_id)
    if not edp.compare_job_type(job.type, edp.JOB_TYPE_JAVA):
        input_source = conductor.data_source_get(ctx, job_execution.input_id)
        output_source = conductor.data_source_get(ctx, job_execution.output_id)
    else:
        input_source = None
        output_source = None

    for data_source in [input_source, output_source]:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(cluster, data_source)

    hdfs_user = _get_hdfs_user(cluster)
    oozie_server = _get_oozie_server(cluster)
    wf_dir = create_workflow_dir(oozie_server, job, hdfs_user)
    upload_job_files(oozie_server, wf_dir, job, hdfs_user)

    creator = workflow_factory.get_creator(job)

    wf_xml = creator.get_workflow_xml(cluster, job_execution,
                                      input_source, output_source)

    path_to_workflow = upload_workflow_file(oozie_server,
                                            wf_dir, wf_xml, hdfs_user)

    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    rm_path = plugin.get_resource_manager_uri(cluster)
    nn_path = plugin.get_name_node_uri(cluster)

    client = _create_oozie_client(cluster)
    job_parameters = {"jobTracker": rm_path,
                      "nameNode": nn_path,
                      "user.name": hdfs_user,
                      "oozie.wf.application.path":
                      "%s%s" % (nn_path, path_to_workflow),
                      "oozie.use.system.libpath": "true"}

    oozie_job_id = client.add_job(x.create_hadoop_xml(job_parameters),
                                  job_execution)
    job_execution = conductor.job_execution_update(ctx, job_execution,
                                                   {'oozie_job_id':
                                                    oozie_job_id,
                                                    'start_time':
                                                    datetime.datetime.now()})
    client.run_job(job_execution, oozie_job_id)
Example #32
    def render(self):
        return xml.create_hadoop_xml(self._config_dict)
Example #33
    def render(self):
        return xml.create_hadoop_xml(self._config_dict)
Example #34
    def run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_info(job_execution, oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)
Example #35
def create_hadoop_xml(configs, config_filter=None, **kwargs):
    return xmlutils.create_hadoop_xml(configs, config_filter)
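
A hedged usage sketch of this thin wrapper follows. The property names are invented for illustration, and the assumed behaviour (no filter renders every pair; a filter keeps only the names it contains) is inferred from the other examples and the unit tests above rather than guaranteed by this snippet.

# Hypothetical call; the property names are illustrative only.
props = {
    'fs.defaultFS': 'hdfs://namenode-host:8020',
    'dfs.replication': '3',
}

# No filter: presumably every key/value pair ends up in the generated XML.
core_site_xml = create_hadoop_xml(props)

# With a filter (e.g. a defaults list loaded from a *-default.xml resource,
# as in the unit tests above), only the filtered names would be kept:
# core_site_xml = create_hadoop_xml(props, config_filter=core_defaults)
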
Example #36
def to_xml_file_content(data):
    return x.create_hadoop_xml(data)
Example #37
    def run_job(self, job_execution):
        ctx = context.ctx()

        # This will be a dictionary of tuples, (native_url, runtime_url)
        # keyed by data_source id
        data_source_urls = {}

        job = conductor.job_get(ctx, job_execution.job_id)
        input_source, output_source = job_utils.get_data_sources(
            job_execution, job, data_source_urls, self.cluster)

        # Updated_job_configs will be a copy of job_execution.job_configs with
        # any name or uuid references to data_sources resolved to paths
        # assuming substitution is enabled.
        # If substitution is not enabled then updated_job_configs will
        # just be a reference to job_execution.job_configs to avoid a copy.
        # Additional_sources will be a list of any data_sources found.
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls,
                                                     self.cluster)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution,
            {"data_source_urls": job_utils.to_url_dict(data_source_urls)})

        # Now that we've recorded the native urls, we can switch to the
        # runtime urls
        data_source_urls = job_utils.to_url_dict(data_source_urls,
                                                 runtime=True)

        proxy_configs = updated_job_configs.get('proxy_configs')
        configs = updated_job_configs.get('configs', {})
        use_hbase_lib = configs.get('edp.hbase_common_lib', {})

        # Extract all the 'oozie.' configs so that they can be set in the
        # job properties file. These are config values for Oozie itself,
        # not the job code
        oozie_params = {}
        for k in list(configs):
            if k.startswith('oozie.'):
                oozie_params[k] = configs[k]

        for data_source in [input_source, output_source] + additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(
                    self.cluster, data_source_urls[data_source.id])
                break

        external_hdfs_urls = self._resolve_external_hdfs_urls(
            job_execution.job_configs)
        for url in external_hdfs_urls:
            h.configure_cluster_for_hdfs(self.cluster, url)

        hdfs_user = self.get_hdfs_user()

        # TODO(tmckay): this should probably be "get_namenode"
        # but that call does not exist in the oozie engine api now.
        oozie_server = self.get_oozie_server(self.cluster)

        wf_dir = self._create_hdfs_workflow_dir(oozie_server, job)
        self._upload_job_files_to_hdfs(oozie_server, wf_dir, job, configs,
                                       proxy_configs)

        wf_xml = workflow_factory.get_workflow_xml(
            job, self.cluster, updated_job_configs,
            input_source, output_source,
            hdfs_user, data_source_urls)

        path_to_workflow = self._upload_workflow_file(oozie_server, wf_dir,
                                                      wf_xml, hdfs_user)

        job_params = self._get_oozie_job_params(hdfs_user,
                                                path_to_workflow,
                                                oozie_params,
                                                use_hbase_lib)

        client = self._get_client()
        oozie_job_id = client.add_job(x.create_hadoop_xml(job_params),
                                      job_execution)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        conductor.job_execution_update(
            context.ctx(), job_execution.id,
            {'info': {'status': edp.JOB_STATUS_READYTORUN},
             'engine_job_id': oozie_job_id})

        client.run_job(job_execution, oozie_job_id)
        try:
            status = client.get_job_status(job_execution,
                                           oozie_job_id)['status']
        except Exception:
            status = None
        return (oozie_job_id, status, None)