Example #1
 def _write_result(self, status, description):
     cond.cluster_health_check_update(
         context.ctx(), self.health_check_id,
         {'status': status, 'description': description})
     self.health_check = cond.cluster_health_check_get(
         context.ctx(), self.health_check_id)
     sender.health_notify(self.cluster, self.health_check)
Example #2
File: api.py Project: rsaha/sahara
def execute_job(job_id, data):
    # Elements common to all job types
    cluster_id = data['cluster_id']
    configs = data.get('job_configs', {})
    interface = data.get('interface', {})

    # Not in Java job types but present for all others
    input_id = data.get('input_id', None)
    output_id = data.get('output_id', None)

    # Since we will use a unified class in the database, we pass
    # a superset for all job types
    job_ex_dict = {'input_id': input_id, 'output_id': output_id,
                   'job_id': job_id, 'cluster_id': cluster_id,
                   'info': {'status': edp.JOB_STATUS_PENDING},
                   'job_configs': configs, 'extra': {},
                   'interface': interface}
    job_execution = conductor.job_execution_create(context.ctx(), job_ex_dict)
    context.set_current_job_execution_id(job_execution.id)

    # Check whether a proxy user is needed for this job execution
    if p.job_execution_requires_proxy_user(job_execution):
        try:
            p.create_proxy_user_for_job_execution(job_execution)
        except ex.SaharaException as e:
            LOG.error(_LE("Can't run job execution. "
                          "(Reasons: {reason})").format(reason=e))
            conductor.job_execution_destroy(context.ctx(), job_execution)
            raise e

    OPS.run_edp_job(job_execution.id)

    return job_execution
Example #3
    def test_verification_start(self, get_health_checks):
        cluster = self._cluster_sample()
        get_health_checks.return_value = [Check]
        verification_base.handle_verification(cluster, {
            'verification': {'status': 'START'}})
        cluster = self.api.cluster_get(context.ctx(), cluster)
        ver = cluster.verification
        self.assertEqual('GREEN', ver['status'])
        self.assertEqual(1, len(ver['checks']))

        self.assertEqual('No criminality', ver.checks[0]['description'])
        id = ver['id']

        get_health_checks.return_value = [YellowCheck, Check, Check]

        verification_base.handle_verification(cluster, {
            'verification': {'status': 'START'}})
        cluster = self.api.cluster_get(context.ctx(), cluster)
        ver = cluster.verification

        self.assertEqual('YELLOW', ver['status'])
        self.assertEqual(3, len(ver['checks']))
        self.assertNotEqual(ver['id'], id)

        get_health_checks.return_value = [RedCheck, YellowCheck]

        verification_base.handle_verification(cluster, {
            'verification': {'status': 'START'}})
        cluster = self.api.cluster_get(context.ctx(), cluster)
        ver = cluster.verification

        self.assertEqual('RED', ver['status'])
        self.assertEqual(2, len(ver['checks']))
        self.assertNotEqual(ver['id'], id)
        self.assertEqual("James bond check", ver['checks'][0]['name'])
Example #4
def job_execution_requires_proxy_user(job_execution):
    '''Returns True if the job execution requires a proxy user.'''

    def _check_values(values):
        return any(value.startswith(
            su.SWIFT_INTERNAL_PREFIX) for value in values if (
                isinstance(value, six.string_types)))

    if CONF.use_domain_for_proxy_users is False:
        return False

    paths = [conductor.data_source_get(context.ctx(), job_execution.output_id),
             conductor.data_source_get(context.ctx(), job_execution.input_id)]
    if _check_values(ds.url for ds in paths if ds):
        return True

    if _check_values(six.itervalues(
            job_execution.job_configs.get('configs', {}))):
        return True

    if _check_values(six.itervalues(
            job_execution.job_configs.get('params', {}))):
        return True

    if _check_values(job_execution.job_configs.get('args', [])):
        return True

    job = conductor.job_get(context.ctx(), job_execution.job_id)
    if _check_values(main.url for main in job.mains):
        return True

    if _check_values(lib.url for lib in job.libs):
        return True

    # We did the simple checks, now if data_source referencing is
    # enabled and we have values that could be a name or uuid,
    # query for data_sources that match and contain a swift path
    by_name, by_uuid = job_utils.may_contain_data_source_refs(
        job_execution.job_configs)
    if by_name:
        names = tuple(job_utils.find_possible_data_source_refs_by_name(
            job_execution.job_configs))
        # do a query here for name in names and path starts with swift-prefix
        if names and conductor.data_source_count(
                context.ctx(),
                name=names,
                url=su.SWIFT_INTERNAL_PREFIX+'%') > 0:
            return True

    if by_uuid:
        uuids = tuple(job_utils.find_possible_data_source_refs_by_uuid(
            job_execution.job_configs))
        # do a query here for id in uuids and path starts with swift-prefix
        if uuids and conductor.data_source_count(
                context.ctx(),
                id=uuids,
                url=su.SWIFT_INTERNAL_PREFIX+'%') > 0:
            return True

    return False
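Not Sahara source, just an editorial aside: the swift-prefix test that Example #4 repeats through `_check_values` boils down to a short-circuiting `any()` over a filtered generator. A minimal, self-contained sketch, assuming a locally defined `SWIFT_INTERNAL_PREFIX` that stands in for `su.SWIFT_INTERNAL_PREFIX`:

# Standalone sketch (not Sahara code) of the _check_values predicate.
SWIFT_INTERNAL_PREFIX = "swift://"  # assumption; stands in for su.SWIFT_INTERNAL_PREFIX

def check_values(values):
    # True if any string value points at the internal swift store;
    # non-string values (None, ints, dicts) are skipped by the filter.
    return any(value.startswith(SWIFT_INTERNAL_PREFIX)
               for value in values if isinstance(value, str))

print(check_values([None, 42, "hdfs://nn:9000/data"]))      # False
print(check_values(["swift://container/object", "other"]))  # True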
Example #5
 def _generate_heat_stack_name(cluster):
     cluster = conductor.cluster_get(context.ctx(), cluster)
     hsn = cluster.name + cluster.id[:8]
     extra = cluster.extra.to_dict() if cluster.extra else {}
     extra['heat_stack_name'] = hsn
     conductor.cluster_update(context.ctx(), cluster, {'extra': extra})
     return conductor.cluster_get(context.ctx(), cluster)
Example #6
    def test_get_hadoop_ssh_keys(self):
        cluster_dict = {
            'name': 'cluster1',
            'plugin_name': 'mock_plugin',
            'hadoop_version': 'mock_version',
            'default_image_id': 'initial',
            'node_groups': [tu.make_ng_dict("ng1", "f1", ["s1"], 1)]}

        cluster1 = conductor.cluster_create(context.ctx(), cluster_dict)
        (private_key1, public_key1) = c_h.get_hadoop_ssh_keys(cluster1)

        # should store keys for old cluster
        cluster1 = conductor.cluster_get(context.ctx(), cluster1)
        (private_key2, public_key2) = c_h.get_hadoop_ssh_keys(cluster1)

        self.assertEqual(public_key1, public_key2)
        self.assertEqual(private_key1, private_key2)

        # should generate new keys for new cluster
        cluster_dict.update({'name': 'cluster2'})
        cluster2 = conductor.cluster_create(context.ctx(), cluster_dict)
        (private_key3, public_key3) = c_h.get_hadoop_ssh_keys(cluster2)

        self.assertNotEqual(public_key1, public_key3)
        self.assertNotEqual(private_key1, private_key3)
Example #7
def get_oozie_password(cluster):
    cluster = conductor.cluster_get(context.ctx(), cluster)
    extra = cluster.extra.to_dict()
    if 'oozie_pass_id' not in extra:
        extra['oozie_pass_id'] = u.generate_random_password()
        conductor.cluster_update(context.ctx(), cluster, {'extra': extra})
    return castellan.get_secret(extra['oozie_pass_id'])
Example #8
    def update_plugin(self, plugin_name, values):
        ctx = context.ctx()
        current = self.get_label_details(plugin_name)
        if not conductor.plugin_get(ctx, plugin_name):
            current['name'] = plugin_name
            conductor.plugin_create(ctx, current)
            del current['name']

        if values.get(PLUGIN_LABELS_SCOPE):
            for label in values.get(PLUGIN_LABELS_SCOPE).keys():
                current[PLUGIN_LABELS_SCOPE][label].update(
                    values.get(PLUGIN_LABELS_SCOPE).get(label))
        else:
            del current[PLUGIN_LABELS_SCOPE]

        if values.get(VERSION_LABELS_SCOPE):
            vl = values.get(VERSION_LABELS_SCOPE)
            for version in vl.keys():
                for label in vl.get(version).keys():
                    current[VERSION_LABELS_SCOPE][version][label].update(
                        vl[version][label])
        else:
            del current[VERSION_LABELS_SCOPE]

        conductor.plugin_update(context.ctx(), plugin_name, current)
Example #9
    def test_get_instances(self):
        cluster = self._make_sample()
        ctx = context.ctx()
        idx = 0
        ids = []
        for ng in cluster.node_groups:
            for i in range(ng.count):
                idx += 1
                ids.append(self.api.instance_add(context.ctx(), ng, {
                    'instance_id': str(idx),
                    'instance_name': str(idx),
                }))
        cluster = self.api.cluster_get(ctx, cluster)
        instances = general.get_instances(cluster, ids)
        ids = set()
        for inst in instances:
            ids.add(inst.instance_id)
        self.assertEqual(idx, len(ids))
        for i in range(1, idx):
            self.assertIn(str(i), ids)

        instances = general.get_instances(cluster)
        ids = set()
        for inst in instances:
            ids.add(inst.instance_id)
        self.assertEqual(idx, len(ids))
        for i in range(1, idx):
            self.assertIn(str(i), ids)
Example #10
def check_data_sources_are_different(data_source_1_id, data_source_2_id):
    ds1 = conductor.data_source_get(context.ctx(), data_source_1_id)
    ds2 = conductor.data_source_get(context.ctx(), data_source_2_id)

    if ds1.type == ds2.type and ds1.url == ds2.url:
        raise ex.InvalidDataException(_('Provided input and output '
                                        'DataSources reference the same '
                                        'location: %s') % ds1.url)
Example #11
def clean_verification_data(cluster):
    cluster = cond.cluster_get(context.ctx(), cluster)
    if verification_exists(cluster):
        try:
            vid = cluster.verification.id
            cond.cluster_verification_delete(context.ctx(), vid)
        except exceptions.NotFoundException:
            LOG.debug("Verification data already cleaned")
Example #12
 def _indicate_start(self):
     vid = self.cluster.verification.id
     self.health_check_id = cond.cluster_health_check_add(
         context.ctx(), vid, {'status': common.HEALTH_STATUS_CHECKING,
                              'name': self.get_health_check_name()}).id
     self.health_check = cond.cluster_health_check_get(
         context.ctx(), self.health_check_id)
     sender.health_notify(self.cluster, self.health_check)
Example #13
    def test_apply_recommended_configs(self, cond_cluster, cond_node_group,
                                       fake_flavor):
        fake_flavor.return_value = FakeObject(ram=2048, vcpus=1)
        to_tune = {
            'cluster_configs': {
                'dfs.replication': ('dfs', 'replica')
            },
            'node_configs': {
                'mapreduce.task.io.sort.mb': ('bond', 'extra_name')
            }
        }

        fake_plugin_configs = [
            FakeObject(applicable_target='dfs', name='replica',
                       default_value=3)]
        fake_ng = FakeObject(
            use_autoconfig=True,
            count=2,
            node_processes=['dog_datanode'],
            flavor_id='fake_id',
            node_configs=Configs({
                'bond': {
                    'name': 'james'
                }
            })
        )
        fake_cluster = FakeObject(
            cluster_configs=Configs({
                'cat': {
                    'talk': 'meow',
                }
            }),
            node_groups=[fake_ng],
            use_autoconfig=True,
        )
        v = ru.HadoopAutoConfigsProvider(
            to_tune, fake_plugin_configs, fake_cluster,
            {'datanode_process_name': "dog_datanode"})

        v.apply_recommended_configs()
        self.assertEqual([mock.call(context.ctx(), fake_cluster, {
            'cluster_configs': {
                'cat': {
                    'talk': 'meow'
                },
                'dfs': {
                    'replica': 2
                }
            }
        })], cond_cluster.call_args_list)
        self.assertEqual([mock.call(context.ctx(), fake_ng, {
            'node_configs': {
                'bond': {
                    'name': 'james',
                    'extra_name': 102
                }
            }
        })], cond_node_group.call_args_list)
Example #14
def update_cluster(id, values):
    if "update_keypair" in values:
        if values["update_keypair"]:
            api.OPS.update_keypair(id)
        values.pop("update_keypair")
    if verification_base.update_verification_required(values):
        api.OPS.handle_verification(id, values)
        return conductor.cluster_get(context.ctx(), id)
    return conductor.cluster_update(context.ctx(), id, values)
Example #15
def get_raw_binary(job_binary):
    url = job_binary.url
    if url.startswith("internal-db://"):
        res = db.get_raw_data(context.ctx(), job_binary)

    # TODO(mattf): remove support for OLD_SWIFT_INTERNAL_PREFIX
    if url.startswith(su.SWIFT_INTERNAL_PREFIX) or (
            url.startswith(su.OLD_SWIFT_INTERNAL_PREFIX)):
        res = i_swift.get_raw_data(context.ctx(), job_binary)

    return res
Example #16
    def _set_cluster_info(self, cluster):
        nn = vu.get_namenode(cluster)
        rm = vu.get_resourcemanager(cluster)
        hs = vu.get_historyserver(cluster)
        oo = vu.get_oozie(cluster)

        info = {}

        if rm:
            info['YARN'] = {
                'Web UI': 'http://%s:%s' % (rm.management_ip, '8088'),
                'ResourceManager': 'http://%s:%s' % (rm.management_ip, '8032')
            }

        if nn:
            info['HDFS'] = {
                'Web UI': 'http://%s:%s' % (nn.management_ip, '50070'),
                'NameNode': 'hdfs://%s:%s' % (nn.hostname(), '9000')
            }

        if oo:
            info['JobFlow'] = {
                'Oozie': 'http://%s:%s' % (oo.management_ip, '11000')
            }

        if hs:
            info['MapReduce JobHistory Server'] = {
                'Web UI': 'http://%s:%s' % (hs.management_ip, '19888')
            }

        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {'info': info})
Example #17
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        ips_assigned = set()
        while len(ips_assigned) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in ips_assigned:
                    if networks.init_instances_ips(instance):
                        ips_assigned.add(instance.id)

            context.sleep(1)

        LOG.info(
            _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

        cluster = conductor.cluster_get(context.ctx(), cluster)
        instances = g.get_instances(cluster, ips_assigned)

        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

        LOG.info(_LI("Cluster '%s': all instances are accessible"), cluster.id)
Example #18
    def test_transient_cluster_terminate(self, terminate_cluster,
                                         use_os_admin_auth_token):

        timeutils.set_time_override(datetime.datetime(2005, 2, 1, 0, 0))

        ctx = context.ctx()
        job = self.api.job_create(ctx, te.SAMPLE_JOB)
        ds = self.api.data_source_create(ctx, te.SAMPLE_DATA_SOURCE)

        self._make_cluster('1')
        self._make_cluster('2')

        self._create_job_execution({"end_time": timeutils.utcnow(),
                                    "id": 1,
                                    "cluster_id": "1"},
                                   job, ds, ds)
        self._create_job_execution({"end_time": None,
                                    "id": 2,
                                    "cluster_id": "2"},
                                   job, ds, ds)
        self._create_job_execution({"end_time": None,
                                    "id": 3,
                                    "cluster_id": "2"},
                                   job, ds, ds)

        timeutils.set_time_override(datetime.datetime(2005, 2, 1, 0, 1))

        p._make_periodic_tasks().terminate_unneeded_transient_clusters(None)
        self.assertEqual(1, terminate_cluster.call_count)
        terminate_cluster.assert_has_calls([mock.call(u'1')])
        self.assertEqual(1, use_os_admin_auth_token.call_count)
Example #19
    def apply_node_configs(self, node_group):
        """Method applies configs for node_group using conductor api,

        which were calculated with recommend_node_configs method.
        :param node_group: NodeGroup Sahara resource.
        :return: None.
        """
        if not node_group.use_autoconfig or not self.cluster.use_autoconfig:
            return
        to_update = self.node_configs_to_update
        recommended_node_configs = self._get_recommended_node_configs(
            node_group)
        if not recommended_node_configs:
            # Nothing to configure
            return
        current_dict = node_group.node_configs.to_dict()
        configuration = {}
        for ncfg in six.iterkeys(to_update):
            if ncfg not in recommended_node_configs:
                continue
            n_section = to_update[ncfg][0]
            n_name = to_update[ncfg][1]
            proposed_config_value = recommended_node_configs[ncfg]
            if n_section not in configuration:
                configuration.update({n_section: {}})
            configuration[n_section].update({n_name: proposed_config_value})
        current_dict = self._merge_configs(current_dict, configuration)
        conductor.node_group_update(context.ctx(), node_group,
                                    {'node_configs': current_dict})
Example #20
    def apply_cluster_configs(self):
        """Method applies configs for cluster using conductor api, which were

        calculated with recommend_cluster_configs method.
        :return: None.
        """
        cluster = self.cluster
        if not cluster.use_autoconfig:
            return
        to_update = self.cluster_configs_to_update
        recommended_cluster_configs = self._get_recommended_cluster_configs()
        if not recommended_cluster_configs:
            # Nothing to configure
            return
        current_dict = cluster.cluster_configs.to_dict()
        configuration = {}
        for ncfg in six.iterkeys(to_update):
            if ncfg not in recommended_cluster_configs:
                continue
            n_section = to_update[ncfg][0]
            n_name = to_update[ncfg][1]
            proposed_config_value = recommended_cluster_configs[ncfg]
            if n_section not in configuration:
                configuration.update({n_section: {}})
            configuration[n_section].update({n_name: proposed_config_value})
        current_dict = self._merge_configs(current_dict, configuration)
        conductor.cluster_update(context.ctx(), cluster,
                                 {'cluster_configs': current_dict})
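Not Sahara source: the two apply_* methods above share one regrouping step, turning flat config keys into a {section: {name: value}} mapping and merging it over the current configs. A minimal sketch with plain dicts; all names are illustrative, and the merge shown is an assumption about what _merge_configs does:

# Standalone sketch (not Sahara code) of the regroup-and-merge pattern.
def build_configuration(to_update, recommended):
    # to_update maps a flat config key to its (section, name) pair;
    # recommended maps the same flat key to a proposed value.
    configuration = {}
    for key, (section, name) in to_update.items():
        if key in recommended:
            configuration.setdefault(section, {})[name] = recommended[key]
    return configuration

def merge_configs(current, new):
    # Two-level merge: values from `new` win inside each section.
    merged = {section: dict(values) for section, values in current.items()}
    for section, values in new.items():
        merged.setdefault(section, {}).update(values)
    return merged

current = {'cat': {'talk': 'meow'}}
to_update = {'dfs.replication': ('dfs', 'replica')}
print(merge_configs(current,
                    build_configuration(to_update, {'dfs.replication': 2})))
# {'cat': {'talk': 'meow'}, 'dfs': {'replica': 2}}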
Example #21
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

        ips_assigned = set()
        self._ips_assign(ips_assigned, cluster, instances)

        LOG.info(
            _LI("Cluster {cluster_id}: all instances have IPs assigned")
            .format(cluster_id=cluster.id))

        cluster = conductor.cluster_get(context.ctx(), cluster)
        instances = g.get_instances(cluster, ips_assigned)

        cpo.add_provisioning_step(
            cluster.id, _("Wait for instance accessibility"), len(instances))

        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

        LOG.info(_LI("Cluster {cluster_id}: all instances are accessible")
                 .format(cluster_id=cluster.id))
Example #22
 def finalize_autoconfiguration(self):
     if not self.cluster.use_autoconfig:
         return
     cluster_extra = self._get_cluster_extra()
     cluster_extra['auto-configured'] = True
     conductor.cluster_update(
         context.ctx(), self.cluster, {'extra': cluster_extra})
Example #23
    def test_data_source_count_in(self):
        ctx = context.ctx()
        ctx.tenant_id = SAMPLE_DATA_SOURCE['tenant_id']
        src = copy.copy(SAMPLE_DATA_SOURCE)
        self.api.data_source_create(ctx, src)

        cnt = self.api.data_source_count(ctx, name='ngt_test')
        self.assertEqual(1, cnt)

        cnt = self.api.data_source_count(ctx, name=('ngt_test',
                                                    'test2', 'test3'))
        self.assertEqual(1, cnt)

        cnt = self.api.data_source_count(ctx, name=('test1',
                                                    'test2', 'test3'))
        self.assertEqual(0, cnt)

        lst = self.api.data_source_get_all(ctx, name='ngt_test')
        myid = lst[0]['id']
        cnt = self.api.data_source_count(ctx,
                                         name=('ngt_test', 'test2', 'test3'),
                                         id=myid)
        self.assertEqual(1, cnt)

        cnt = self.api.data_source_count(ctx,
                                         name=('ngt_test', 'test2', 'test3'),
                                         id=(myid, '2'))
        self.assertEqual(1, cnt)
Example #24
    def test_job_execution_search(self):
        ctx = context.ctx()
        job = self.api.job_create(ctx, SAMPLE_JOB)
        ds_input = self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
        SAMPLE_DATA_OUTPUT = copy.copy(SAMPLE_DATA_SOURCE)
        SAMPLE_DATA_OUTPUT['name'] = 'output'
        ds_output = self.api.data_source_create(ctx, SAMPLE_DATA_OUTPUT)

        SAMPLE_JOB_EXECUTION['job_id'] = job['id']
        SAMPLE_JOB_EXECUTION['input_id'] = ds_input['id']
        SAMPLE_JOB_EXECUTION['output_id'] = ds_output['id']

        ctx.tenant_id = SAMPLE_JOB_EXECUTION['tenant_id']
        self.api.job_execution_create(ctx, SAMPLE_JOB_EXECUTION)

        lst = self.api.job_execution_get_all(ctx)
        self.assertEqual(1, len(lst))

        kwargs = {'tenant_id': SAMPLE_JOB_EXECUTION['tenant_id']}
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(1, len(lst))

        # Valid field but no matching value
        kwargs = {'job_id': SAMPLE_JOB_EXECUTION['job_id']+"foo"}
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(0, len(lst))

        # Invalid field
        self.assertRaises(sa_exc.InvalidRequestError,
                          self.api.job_execution_get_all,
                          ctx, **{'badfield': 'somevalue'})
Example #25
def change_cluster_status_description(cluster, status_description):
    try:
        ctx = context.ctx()
        return conductor.cluster_update(
            ctx, cluster, {'status_description': status_description})
    except e.NotFoundException:
        return None
Example #26
def change_cluster_status(cluster, status, status_description=None):
    ctx = context.ctx()

    # Update cluster status. Race conditions with deletion are still possible,
    # but re-fetching here at least reduces the probability.
    cluster = conductor.cluster_get(ctx, cluster) if cluster else None

    if status_description is not None:
        change_cluster_status_description(cluster, status_description)

    # 'Deleting' is final and can't be changed
    if cluster is None or cluster.status == CLUSTER_STATUS_DELETING:
        return cluster

    update_dict = {"status": status}
    cluster = conductor.cluster_update(ctx, cluster, update_dict)
    conductor.cluster_provision_progress_update(ctx, cluster.id)

    LOG.info(_LI("Cluster status has been changed. New status="
                 "{status}").format(status=cluster.status))

    sender.notify(ctx, cluster.id, cluster.name, cluster.status,
                  "update")

    return cluster
Example #27
    def _set_cluster_info(self, cluster):
        nn = vu.get_namenode(cluster)
        jt = vu.get_jobtracker(cluster)
        oozie = vu.get_oozie(cluster)
        info = {}

        if jt:
            ui_port = c_helper.get_port_from_config("MapReduce", "mapred.job.tracker.http.address", cluster)
            jt_port = c_helper.get_port_from_config("MapReduce", "mapred.job.tracker", cluster)

            info["MapReduce"] = {
                "Web UI": "http://%s:%s" % (jt.management_ip, ui_port),
                "JobTracker": "%s:%s" % (jt.hostname(), jt_port),
            }

        if nn:
            ui_port = c_helper.get_port_from_config("HDFS", "dfs.http.address", cluster)
            nn_port = c_helper.get_port_from_config("HDFS", "fs.default.name", cluster)

            info["HDFS"] = {
                "Web UI": "http://%s:%s" % (nn.management_ip, ui_port),
                "NameNode": "hdfs://%s:%s" % (nn.hostname(), nn_port),
            }

        if oozie:
            # TODO(yrunts) change from hardcoded value
            info["JobFlow"] = {"Oozie": "http://%s:11000" % oozie.management_ip}

        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {"info": info})
Example #28
def _run_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != c_u.CLUSTER_STATUS_ACTIVE:
        return

    eng = _get_job_engine(cluster, job_execution)
    if eng is None:
        raise e.EDPError(_("Cluster does not support job type %s")
                         % _get_job_type(job_execution))
    job_execution = _update_job_execution_extra(cluster, job_execution)

    # Job id is a string
    # Status is a string
    # Extra is a dictionary to add to extra in the job_execution
    jid, status, extra = eng.run_job(job_execution)

    # Set the job id and the start time
    # Optionally, update the status and the 'extra' field
    update_dict = {'oozie_job_id': jid,
                   'start_time': datetime.datetime.now()}
    if status:
        update_dict['info'] = {'status': status}
    if extra:
        curr_extra = job_execution.extra.copy()
        curr_extra.update(extra)
        update_dict['extra'] = curr_extra

    job_execution = conductor.job_execution_update(
        ctx, job_execution, update_dict)
Example #29
    def _shutdown_instance(self, instance):
        ctx = context.ctx()

        if instance.node_group.floating_ip_pool:
            try:
                networks.delete_floating_ip(instance.instance_id)
            except nova_exceptions.NotFound:
                LOG.warn(_LW("Attempted to delete non-existent floating IP in "
                         "pool %(pool)s from instance %(instance)s"),
                         {'pool': instance.node_group.floating_ip_pool,
                          'instance': instance.instance_id})

        try:
            volumes.detach_from_instance(instance)
        except Exception:
            LOG.warn(_LW("Detaching volumes from instance %s failed"),
                     instance.instance_id)

        try:
            nova.client().servers.delete(instance.instance_id)
        except nova_exceptions.NotFound:
            LOG.warn(_LW("Attempted to delete non-existent instance %s"),
                     instance.instance_id)

        conductor.instance_remove(ctx, instance)
Example #30
    def scale_cluster(self, cluster, node_group_id_map):
        ctx = context.ctx()
        cluster = g.change_cluster_status(cluster, "Scaling")

        instance_ids = self._scale_cluster_instances(cluster,
                                                     node_group_id_map)

        self._update_rollback_strategy(cluster, instance_ids=instance_ids)

        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)

        cluster = conductor.cluster_get(ctx, cluster)
        instances = g.get_instances(cluster, instance_ids)

        self._await_active(cluster, instances)

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)

        volumes.attach_to_instances(
            g.get_instances(cluster, instance_ids))

        # We should reach this point with a valid cluster: if instance creation
        # was not successful, all extra instances have been removed above
        if instance_ids:
            self._configure_instances(cluster)

        self._update_rollback_strategy(cluster)

        return instance_ids
Example #31
def update_job_execution(id, values):
    _update_status(values.pop("info", None))
    return conductor.job_execution_update(context.ctx(), id, values)
Example #32
def get_data_sources(**kwargs):
    return conductor.data_source_get_all(context.ctx(),
                                         regex_search=True,
                                         **kwargs)
Example #33
def update_job(id, values):
    return conductor.job_update(context.ctx(), id, values)
Example #34
def get_data_source(id):
    return conductor.data_source_get(context.ctx(), id)
Example #35
def create_job_binary_internal(values):
    return conductor.job_binary_internal_create(context.ctx(), values)
Example #36
def _setup_trust_for_cluster(cluster):
    cluster = conductor.cluster_get(context.ctx(), cluster)
    trusts.create_trust_for_cluster(cluster)
    trusts.use_os_admin_auth_token(cluster)
Example #37
def register_data_source(values):
    return conductor.data_source_create(context.ctx(), values)
Example #38
def get_cluster_templates(**kwargs):
    return conductor.cluster_template_get_all(context.ctx(),
                                              regex_search=True, **kwargs)
Example #39
def update_cluster(id, values):
    if verification_base.update_verification_required(values):
        api.OPS.handle_verification(id, values)
        return conductor.cluster_get(context.ctx(), id)
    return conductor.cluster_update(context.ctx(), id, values)
Example #40
def data_source_update(id, values):
    return conductor.data_source_update(context.ctx(), id, values)
Example #41
    def _set_cluster_info(self, cluster):
        ambari_ip = plugin_utils.get_instance(
            cluster, p_common.AMBARI_SERVER).get_ip_or_dns_name()
        ambari_port = "8080"
        info = {
            p_common.AMBARI_SERVER: {
                "Web UI":
                "http://{host}:{port}".format(host=ambari_ip,
                                              port=ambari_port),
                "Username":
                "******",
                "Password":
                cluster.extra["ambari_password"]
            }
        }
        nns = plugin_utils.get_instances(cluster, p_common.NAMENODE)
        info[p_common.NAMENODE] = {}
        for idx, namenode in enumerate(nns):
            info[p_common.NAMENODE]["Web UI %s" % (idx + 1)] = (
                "http://%s:50070" % namenode.get_ip_or_dns_name())

        rms = plugin_utils.get_instances(cluster, p_common.RESOURCEMANAGER)
        info[p_common.RESOURCEMANAGER] = {}
        for idx, resourcemanager in enumerate(rms):
            info[p_common.RESOURCEMANAGER]["Web UI %s" % (idx + 1)] = (
                "http://%s:8088" % resourcemanager.get_ip_or_dns_name())

        historyserver = plugin_utils.get_instance(cluster,
                                                  p_common.HISTORYSERVER)
        if historyserver:
            info[p_common.HISTORYSERVER] = {
                "Web UI":
                "http://%s:19888" % historyserver.get_ip_or_dns_name()
            }
        atlserver = plugin_utils.get_instance(cluster,
                                              p_common.APP_TIMELINE_SERVER)
        if atlserver:
            info[p_common.APP_TIMELINE_SERVER] = {
                "Web UI": "http://%s:8188" % atlserver.get_ip_or_dns_name()
            }
        oozie = plugin_utils.get_instance(cluster, p_common.OOZIE_SERVER)
        if oozie:
            info[p_common.OOZIE_SERVER] = {
                "Web UI": "http://%s:11000/oozie" % oozie.get_ip_or_dns_name()
            }
        hbase_master = plugin_utils.get_instance(cluster,
                                                 p_common.HBASE_MASTER)
        if hbase_master:
            info[p_common.HBASE_MASTER] = {
                "Web UI":
                "http://%s:16010" % hbase_master.get_ip_or_dns_name()
            }
        falcon = plugin_utils.get_instance(cluster, p_common.FALCON_SERVER)
        if falcon:
            info[p_common.FALCON_SERVER] = {
                "Web UI": "http://%s:15000" % falcon.get_ip_or_dns_name()
            }
        storm_ui = plugin_utils.get_instance(cluster, p_common.STORM_UI_SERVER)
        if storm_ui:
            info[p_common.STORM_UI_SERVER] = {
                "Web UI": "http://%s:8744" % storm_ui.get_ip_or_dns_name()
            }
        ranger_admin = plugin_utils.get_instance(cluster,
                                                 p_common.RANGER_ADMIN)
        if ranger_admin:
            info[p_common.RANGER_ADMIN] = {
                "Web UI": "http://%s:6080" % ranger_admin.get_ip_or_dns_name(),
                "Username": "******",
                "Password": "******"
            }
        spark_hs = plugin_utils.get_instance(cluster,
                                             p_common.SPARK_JOBHISTORYSERVER)
        if spark_hs:
            info[p_common.SPARK_JOBHISTORYSERVER] = {
                "Web UI": "http://%s:18080" % spark_hs.get_ip_or_dns_name()
            }
        info.update(cluster.info.to_dict())
        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {"info": info})
        cluster = conductor.cluster_get(ctx, cluster.id)
Example #42
    def test_job_execution_advanced_search(self):
        ctx = context.ctx()
        job = self.api.job_create(ctx, SAMPLE_JOB)
        ds_input = self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
        SAMPLE_DATA_OUTPUT = copy.copy(SAMPLE_DATA_SOURCE)
        SAMPLE_DATA_OUTPUT['name'] = 'output'
        ds_output = self.api.data_source_create(ctx, SAMPLE_DATA_OUTPUT)

        # Create a cluster
        cl1 = self.api.cluster_create(ctx, test_clusters.SAMPLE_CLUSTER)

        # Create a second cluster with a different name
        cl2_vals = copy.copy(test_clusters.SAMPLE_CLUSTER)
        cl2_vals['name'] = 'test_cluster2'
        cl2 = self.api.cluster_create(ctx, cl2_vals)

        my_sample_job_exec = copy.copy(SAMPLE_JOB_EXECUTION)

        my_sample_job_exec['job_id'] = job['id']
        my_sample_job_exec['input_id'] = ds_input['id']
        my_sample_job_exec['output_id'] = ds_output['id']
        my_sample_job_exec['cluster_id'] = cl1['id']

        # Run job on cluster 1
        self.api.job_execution_create(ctx, my_sample_job_exec)

        # Run the same job on cluster 2 and set status
        my_sample_job_exec['cluster_id'] = cl2['id']
        my_sample_job_exec['info'] = {'status': 'KiLLeD'}
        self.api.job_execution_create(ctx, my_sample_job_exec)

        # Search only with job execution fields (finds both)
        lst = self.api.job_execution_get_all(ctx, **{'return_code': 1})
        self.assertEqual(2, len(lst))

        # Search on cluster name
        kwargs = {'cluster.name': cl1['name'], 'return_code': 1}
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(1, len(lst))

        # Search on cluster name and job name
        kwargs = {
            'cluster.name': cl1['name'],
            'job.name': SAMPLE_JOB['name'],
            'return_code': 1
        }
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(1, len(lst))

        # Search on cluster name, job name, and status
        kwargs = {
            'cluster.name': cl2['name'],
            'job.name': SAMPLE_JOB['name'],
            'status': 'killed',
            'return_code': 1
        }
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(1, len(lst))

        # Search on job name (finds both)
        kwargs = {'job.name': SAMPLE_JOB['name'], 'return_code': 1}
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(2, len(lst))

        # invalid cluster name value
        kwargs = {
            'cluster.name': cl1['name'] + 'foo',
            'job.name': SAMPLE_JOB['name']
        }
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(0, len(lst))

        # invalid job name value
        kwargs = {
            'cluster.name': cl1['name'],
            'job.name': SAMPLE_JOB['name'] + 'foo'
        }
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(0, len(lst))

        # invalid status value
        kwargs = {'cluster.name': cl1['name'], 'status': 'PENDING'}
        lst = self.api.job_execution_get_all(ctx, **kwargs)
        self.assertEqual(0, len(lst))
Example #43
 def get_script_name(self, job):
     return conductor.job_main_name(context.ctx(), job)
Example #44
 def test_duplicate_job_binary_create(self):
     ctx = context.ctx()
     self.api.job_binary_create(ctx, SAMPLE_JOB_BINARY)
     with testtools.ExpectedException(ex.DBDuplicateEntry):
         self.api.job_binary_create(ctx, SAMPLE_JOB_BINARY)
Example #45
def get_job_binary_data(id):
    job_binary = conductor.job_binary_get(context.ctx(), id)
    return dispatch.get_raw_binary(job_binary, with_context=True)
Example #46
 def test_duplicate_data_source_create(self):
     ctx = context.ctx()
     self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
     with testtools.ExpectedException(ex.DBDuplicateEntry):
         self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
Example #47
def get_job_binary_internal_data(id):
    return conductor.job_binary_internal_get_raw_data(context.ctx(), id)
Example #48
    def _nullify_ng_counts(self, cluster):
        ctx = context.ctx()

        for node_group in cluster.node_groups:
            conductor.node_group_update(ctx, node_group, {"count": 0})
Example #49
def create_job(values):
    return conductor.job_create(context.ctx(), values)
Example #50
def update_job_binary_internal(id, values):
    return conductor.job_binary_internal_update(context.ctx(), id, values)
Example #51
def get_job_binary_internal(id):
    return conductor.job_binary_internal_get(context.ctx(), id)
Example #52
def delete_job_binary_internal(id):
    conductor.job_binary_internal_destroy(context.ctx(), id)
Example #53
def cancel_job_execution(id):
    context.set_current_job_execution_id(id)
    job_execution = conductor.job_execution_get(context.ctx(), id)
    OPS.cancel_job_execution(id)

    return job_execution
Example #54
def get_job_binary_internals(**kwargs):
    return conductor.job_binary_internal_get_all(context.ctx(),
                                                 regex_search=True,
                                                 **kwargs)
Example #55
def get_job(id):
    return conductor.job_get(context.ctx(), id)
Example #56
def delete_job(job_id):
    return conductor.job_destroy(context.ctx(), job_id)
Example #57
def delete_data_source(id):
    conductor.data_source_destroy(context.ctx(), id)
Example #58
def get_job_binary(id):
    return conductor.job_binary_get(context.ctx(), id)
Example #59
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        proxy_configs = job_execution.job_configs.get('proxy_configs')

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id)
        paths = job_utils.upload_job_files(master,
                                           wf_dir,
                                           job,
                                           libs_subdir=False,
                                           proxy_configs=proxy_configs)

        # We can shorten the paths in this case since we'll run from within wf_dir
        paths = [os.path.basename(p) for p in paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)

        # The rest of the paths will be passed with --jars
        additional_jars = ",".join(paths)
        if additional_jars:
            additional_jars = "--jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark", "Spark home", self.cluster),
            "bin/spark-submit")

        job_class = job_execution.job_configs.configs["edp.java.main_class"]

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = " ".join(job_execution.job_configs.get('args', []))

        # The redirects of stdout and stderr will preserve output in the wf_dir
        cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
            spark_submit, app_jar, job_class, additional_jars, host, port,
            args)

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id, edp.JOB_STATUS_RUNNING, {
                'spark-path': wf_dir
            })

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
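Not Sahara source: the launch in run_job above relies on a shell idiom, backgrounding the command with its output redirected and echoing $! to get the background PID, which the engine then stores as part of the job id. A minimal local sketch of that idiom using only the standard library (the command here is a placeholder):

# Standalone sketch (not Sahara code) of the background-launch idiom.
import subprocess

cmd = "sleep 30"  # placeholder for the generated spark-submit command line
out = subprocess.check_output(
    "cd /tmp; %s > /dev/null 2>&1 & echo $!" % cmd, shell=True)
pid = int(out.strip())
print("background pid:", pid)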
Example #60
def get_jobs(**kwargs):
    return conductor.job_get_all(context.ctx(), regex_search=True, **kwargs)