Example 1
    def terminate_unneeded_clusters(self, ctx):
        LOG.debug('Terminating unneeded clusters')
        ctx = context.get_admin_context()
        context.set_ctx(ctx)
        for cluster in conductor.cluster_get_all(ctx, status='Active'):
            if not cluster.is_transient:
                continue

            jc = conductor.job_execution_count(ctx,
                                               end_time=None,
                                               cluster_id=cluster.id)

            if jc > 0:
                continue

            cluster_updated_at = timeutils.normalize_time(
                timeutils.parse_isotime(cluster.updated_at))
            current_time = timeutils.utcnow()
            spacing = timeutils.delta_seconds(cluster_updated_at, current_time)
            if spacing < CONF.min_transient_cluster_active_time:
                continue

            if CONF.use_identity_api_v3:
                trusts.use_os_admin_auth_token(cluster)
                api.terminate_cluster(cluster.id)
                LOG.debug('Terminated cluster %s with id %s' %
                          (cluster.name, cluster.id))
            else:
                if cluster.status != 'AwaitingTermination':
                    conductor.cluster_update(
                        ctx,
                        cluster,
                        {'status': 'AwaitingTermination'})
        context.set_ctx(None)
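
The check above terminates a transient cluster only once it has been idle longer than CONF.min_transient_cluster_active_time. A minimal, self-contained sketch of that age calculation, assuming oslo_utils.timeutils (the snippet above imports timeutils from an older namespace); the timestamp and threshold are illustrative values, not taken from the code:

# Sketch only: 'updated_at' and 'min_active_time' stand in for
# cluster.updated_at and CONF.min_transient_cluster_active_time.
from oslo_utils import timeutils

updated_at = '2014-06-01T12:00:00'
min_active_time = 300

cluster_updated_at = timeutils.normalize_time(
    timeutils.parse_isotime(updated_at))
spacing = timeutils.delta_seconds(cluster_updated_at, timeutils.utcnow())

if spacing < min_active_time:
    print('cluster is too young to terminate (%.0f s old)' % spacing)
else:
    print('cluster is eligible for termination (%.0f s old)' % spacing)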
Example 2
        def _inner():
            if initial_delay:
                greenthread.sleep(initial_delay)

            try:
                while self._running:
                    start = timeutils.utcnow()
                    self.f(*self.args, **self.kw)
                    end = timeutils.utcnow()
                    if not self._running:
                        break
                    delay = interval - timeutils.delta_seconds(start, end)
                    if delay <= 0:
                        LOG.warn(_LW('task run outlasted interval by %s sec') %
                                 -delay)
                    greenthread.sleep(delay if delay > 0 else 0)
            except LoopingCallDone as e:
                self.stop()
                done.send(e.retvalue)
            except Exception:
                LOG.exception(_LE('in fixed duration looping call'))
                done.send_exception(*sys.exc_info())
                return
            else:
                done.send(True)
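
The loop above keeps a fixed cadence by subtracting the run time of self.f from the interval before sleeping. A simplified sketch of that delay compensation, using plain time.sleep instead of greenthread.sleep and omitting the LoopingCallDone handling; task, interval and iterations are illustrative stand-ins:

# Simplified sketch of fixed-interval delay compensation; no greenthreads,
# no LoopingCallDone handling. 'task' and 'interval' are illustrative.
import time

from oslo_utils import timeutils


def run_fixed_interval(task, interval, iterations=3):
    for _ in range(iterations):
        start = timeutils.utcnow()
        task()
        end = timeutils.utcnow()
        delay = interval - timeutils.delta_seconds(start, end)
        if delay <= 0:
            print('task run outlasted interval by %s sec' % -delay)
        time.sleep(delay if delay > 0 else 0)


run_fixed_interval(lambda: time.sleep(0.1), interval=1.0)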
Example 3
def _start_cloudera_manager(cluster):
    manager = pu.get_manager(cluster)
    with manager.remote() as r:
        cmd.start_cloudera_db(r)
        cmd.start_manager(r)

    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for Manager to start : " % {
        'timeout': timeout})
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        try:
            conn = telnetlib.Telnet(manager.management_ip, CM_API_PORT)
            conn.close()
            break
        except IOError:
            context.sleep(2)
    else:
        message = _("Cloudera Manager failed to start in %(timeout)s minutes "
                    "on node '%(node)s' of cluster '%(cluster)s'") % {
                        'timeout': timeout / 60,
                        'node': manager.management_ip,
                        'cluster': cluster.name}
        raise ex.HadoopProvisionError(message)

    LOG.info(_LI("Cloudera Manager has been started"))
Example 4
def _detach_volume(instance, volume_id):
    volume = cinder.get_volume(volume_id)
    try:
        LOG.debug("Detaching volume %s from instance %s" % (
            volume_id, instance.instance_name))
        nova.client().volumes.delete_server_volume(instance.instance_id,
                                                   volume_id)
    except Exception:
        LOG.exception(_LE("Can't detach volume %s"), volume.id)

    detach_timeout = CONF.detach_volume_timeout
    LOG.debug("Waiting %d seconds to detach %s volume" % (detach_timeout,
                                                          volume_id))
    s_time = tu.utcnow()
    while tu.delta_seconds(s_time, tu.utcnow()) < detach_timeout:
        volume = cinder.get_volume(volume_id)
        if volume.status not in ['available', 'error']:
            context.sleep(2)
        else:
            LOG.debug("Volume %s has been detached" % volume_id)
            return
    else:
        LOG.warn(_LW("Can't detach volume %(volume)s. "
                     "Current status of volume: %(status)s"),
                 {'volume': volume_id, 'status': volume.status})
Example 5
    def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                                  internal_neutron_net):
        cluster_template_id = self.create_cluster_template(
            name='test-transient-cluster-template-vanilla',
            plugin_config=self.vanilla_config,
            description=('test cluster template for transient cluster '
                         'of Vanilla plugin'),
            cluster_configs={},
            node_groups=[
                dict(
                    name='single-node',
                    flavor_id=self.flavor_id,
                    node_processes=['namenode'],
                    floating_ip_pool=floating_ip_pool,
                    count=1)
            ],
            net_id=internal_neutron_net
        )

        try:
            try:
                cluster_name = (self.common_config.CLUSTER_NAME + '-transient-'
                                + plugin_config.PLUGIN_NAME)
                self.create_cluster(
                    name=cluster_name,
                    plugin_config=plugin_config,
                    cluster_template_id=cluster_template_id,
                    description='test transient cluster',
                    cluster_configs={},
                    is_transient=True
                )
            except Exception:
                self.delete_objects(cluster_id=self.cluster_id)
                raise

            cluster_info = self.get_cluster_info(plugin_config)

            # set timeout in seconds
            timeout = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
            s_time = timeutils.utcnow()
            raise_failure = True
            while timeutils.delta_seconds(
                    s_time, timeutils.utcnow()) < timeout:
                try:
                    self.sahara.clusters.get(cluster_info['cluster_id'])
                except sab.APIException as api_ex:
                    if 'not found' in api_ex.message:
                        raise_failure = False
                        break
                time.sleep(2)

            if raise_failure:
                self.delete_objects(cluster_id=cluster_info['cluster_id'])
                self.fail('Transient cluster has not been deleted within %s '
                          'minutes.'
                          % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
        finally:
            self.delete_objects(cluster_template_id=cluster_template_id)
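
The test treats a "not found" APIException from the clusters endpoint as proof that the transient cluster has been removed. A condensed sketch of that wait, reusing the client calls shown above; the helper name and the sahara_client argument are hypothetical, and the usual "from saharaclient.api import base as sab" import is assumed:

# Condensed sketch of the deletion wait above; 'wait_for_cluster_deletion'
# and 'sahara_client' are hypothetical names, the client calls mirror the test.
import time

from oslo_utils import timeutils
from saharaclient.api import base as sab


def wait_for_cluster_deletion(sahara_client, cluster_id, timeout):
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        try:
            sahara_client.clusters.get(cluster_id)
        except sab.APIException as api_ex:
            if 'not found' in str(api_ex):
                return True  # cluster is gone
        time.sleep(2)
    return False  # still present after the timeout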
Example 6
    def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                                  internal_neutron_net):
        cluster_template_id = self.create_cluster_template(
            name='test-transient-cluster-template-vanilla',
            plugin_config=self.vanilla_config,
            description=('test cluster template for transient cluster '
                         'of Vanilla plugin'),
            cluster_configs={},
            node_groups=[
                dict(name='single-node',
                     flavor_id=self.flavor_id,
                     node_processes=['namenode'],
                     floating_ip_pool=floating_ip_pool,
                     count=1)
            ],
            net_id=internal_neutron_net)

        try:
            try:
                cluster_name = (self.common_config.CLUSTER_NAME +
                                '-transient-' + plugin_config.PLUGIN_NAME)
                self.create_cluster(name=cluster_name,
                                    plugin_config=plugin_config,
                                    cluster_template_id=cluster_template_id,
                                    description='test transient cluster',
                                    cluster_configs={},
                                    is_transient=True)
            except Exception:
                self.delete_objects(cluster_id=self.cluster_id)
                raise

            cluster_info = self.get_cluster_info(plugin_config)

            # set timeout in seconds
            timeout = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
            s_time = timeutils.utcnow()
            raise_failure = True
            while timeutils.delta_seconds(s_time,
                                          timeutils.utcnow()) < timeout:
                try:
                    self.sahara.clusters.get(cluster_info['cluster_id'])
                except sab.APIException as api_ex:
                    if 'not found' in api_ex.message:
                        raise_failure = False
                        break
                time.sleep(2)

            if raise_failure:
                self.delete_objects(cluster_id=cluster_info['cluster_id'])
                self.fail('Transient cluster has not been deleted within %s '
                          'minutes.' %
                          self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
        finally:
            self.delete_objects(cluster_template_id=cluster_template_id)
Example 7
def _check_decommission(cluster, instances, check_func, timeout):
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        dec_ok = True
        for instance in instances:
            if statuses[instance.fqdn()] != 'decommissioned':
                dec_ok = False

        if dec_ok:
            return
        else:
            context.sleep(5)
    else:
        ex.SaharaException("Cannot finish decommission in %d seconds" %
                           timeout)
Example 8
def _check_decommission(cluster, instances, check_func, timeout):
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        dec_ok = True
        for instance in instances:
            if statuses[instance.fqdn()] != 'decommissioned':
                dec_ok = False

        if dec_ok:
            return
        else:
            context.sleep(5)
    else:
        raise ex.DecommissionError(
            "Cannot finish decommission of cluster %s in %d seconds" %
            (cluster, timeout))
Example 9
    def terminate_unneeded_clusters(self, ctx):
        LOG.debug('Terminating unneeded transient clusters')
        ctx = context.get_admin_context()
        context.set_ctx(ctx)
        for cluster in conductor.cluster_get_all(ctx, status='Active'):
            if not cluster.is_transient:
                continue

            jc = conductor.job_execution_count(ctx,
                                               end_time=None,
                                               cluster_id=cluster.id)

            if jc > 0:
                continue

            cluster_updated_at = timeutils.normalize_time(
                timeutils.parse_isotime(cluster.updated_at))
            current_time = timeutils.utcnow()
            spacing = timeutils.delta_seconds(cluster_updated_at, current_time)
            if spacing < CONF.min_transient_cluster_active_time:
                continue

            if CONF.use_identity_api_v3:
                trusts.use_os_admin_auth_token(cluster)

                LOG.info(_LI('Terminating transient cluster %(cluster)s '
                             'with id %(id)s'),
                         {'cluster': cluster.name, 'id': cluster.id})

                try:
                    api.terminate_cluster(cluster.id)
                except Exception as e:
                    LOG.info(_LI('Failed to terminate transient cluster '
                             '%(cluster)s with id %(id)s: %(error)s.'),
                             {'cluster': cluster.name,
                              'id': cluster.id,
                              'error': six.text_type(e)})

            else:
                if cluster.status != 'AwaitingTermination':
                    conductor.cluster_update(
                        ctx,
                        cluster,
                        {'status': 'AwaitingTermination'})
        context.set_ctx(None)
Example 10
def _await_agents(instances):
    api = cu.get_api_client(instances[0].node_group.cluster)
    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for agent connected to manager" % {
        'timeout': timeout})
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        hostnames = [i.fqdn() for i in instances]
        hostnames_to_manager = [h.hostname for h in api.get_all_hosts('full')]
        is_ok = True
        for hostname in hostnames:
            if hostname not in hostnames_to_manager:
                is_ok = False
                break

        if not is_ok:
            context.sleep(5)
        else:
            break
    else:
        raise ex.HadoopProvisionError(_("Cloudera agents failed to connect to"
                                        " Cloudera Manager"))
Example 11
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = config_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False

        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                    utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl':
                    "",
                })
                break
            context.sleep(3)

        if not all_found:
            raise ex.DecommissionError(
                "Cannot finish decommission of cluster %s in %d seconds" %
                (nn.node_group.cluster, timeout))
Example 12
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = c_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False

        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo -u hdfs hadoop dfsadmin -report")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                        utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl': "",
                })
                break
            context.sleep(3)

        if not all_found:
            raise ex.DecommissionError(
                "Cannot finish decommission of cluster %s in %d seconds" %
                (nn.node_group.cluster, timeout))
Example 13
    def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                                  internal_neutron_net):
        cluster_template_id = self.create_cluster_template(
            name='test-transient-cluster-template-vanilla',
            plugin_config=self.vanilla_config,
            description=('test cluster template for transient cluster '
                         'of Vanilla plugin'),
            cluster_configs={},
            node_groups=[
                dict(
                    name='master-node',
                    flavor_id=self.flavor_id,
                    node_processes=['namenode', 'oozie', 'jobtracker'],
                    floating_ip_pool=floating_ip_pool,
                    count=1),
                dict(
                    name='worker-node',
                    flavor_id=self.flavor_id,
                    node_processes=['datanode', 'tasktracker'],
                    floating_ip_pool=floating_ip_pool,
                    count=1)
            ],
            net_id=internal_neutron_net
        )

        try:
            # create a transient cluster
            try:
                cluster_name = (self.common_config.CLUSTER_NAME + '-transient-'
                                + plugin_config.PLUGIN_NAME)
                self.create_cluster(
                    name=cluster_name,
                    plugin_config=plugin_config,
                    cluster_template_id=cluster_template_id,
                    description='test transient cluster',
                    cluster_configs={},
                    is_transient=True
                )
            except Exception:
                self.delete_objects(cluster_id=self.cluster_id)
                raise

            # check EDP
            path = 'sahara/tests/integration/tests/resources/'
            pig_job_data = open(path + 'edp-job.pig').read()
            pig_lib_data = open(path + 'edp-lib.jar').read()
            self.edp_testing(job_type=utils_edp.JOB_TYPE_PIG,
                             job_data_list=[{'pig': pig_job_data}],
                             lib_data_list=[{'jar': pig_lib_data}])

            # set timeout in seconds
            timeout = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
            s_time = timeutils.utcnow()
            raise_failure = True
            # wait for cluster deleting
            while timeutils.delta_seconds(
                    s_time, timeutils.utcnow()) < timeout:
                try:
                    self.sahara.clusters.get(self.cluster_id)
                except sab.APIException as api_ex:
                    if 'not found' in api_ex.message:
                        raise_failure = False
                        break
                time.sleep(2)

            if raise_failure:
                self.delete_objects(cluster_id=self.cluster_id)
                self.fail('Transient cluster has not been deleted within %s '
                          'minutes.'
                          % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
        finally:
            self.delete_objects(cluster_template_id=cluster_template_id)