Example #1
0
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        ips_assigned = set()
        while len(ips_assigned) != len(instances):
            if not g.check_cluster_exists(instances[0].node_group.cluster):
                return
            for instance in instances:
                if instance.id not in ips_assigned:
                    if networks.init_instances_ips(instance):
                        ips_assigned.add(instance.id)

            context.sleep(1)

        LOG.info("Cluster '%s': all instances have IPs assigned" % cluster.id)

        ctx = context.ctx()
        cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
        instances = g.get_instances(cluster, ips_assigned)

        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

        LOG.info("Cluster '%s': all instances are accessible" % cluster.id)
Example #2
0
def _configure_instances(cluster):
    """Configure active instances.

    * generate /etc/hosts
    * setup passwordless login
    * etc.
    """
    hosts = _generate_etc_hosts(cluster)
    for node_group in cluster.node_groups:
        for instance in node_group.instances:
            with remote.get_remote(instance) as r:
                r.write_file_to('etc-hosts', hosts)
                r.execute_command('sudo mv etc-hosts /etc/hosts')

                # wait generate id_rsa key
                timeout = 10
                cur_time = 0
                while cur_time < timeout:
                    code, _ = r.execute_command('ls .ssh/id_rsa',
                                                raise_when_error=False)
                    if code:
                        cur_time += 1
                        context.sleep(1)
                    else:
                        break
                else:
                    raise RuntimeError("Error getting user private key")

                r.execute_command('sudo chown $USER:$USER .ssh/id_rsa')
                r.execute_command('chmod 400 .ssh/id_rsa')
Example #3
0
    def _wait_for_host_registrations(self, num_hosts, ambari_host):
        LOG.info('Waiting for all Ambari agents to register with server ...')

        url = 'http://{0}:8080/api/v1/hosts'.format(ambari_host.management_ip)
        result = None
        json_result = None

        #TODO(jspeidel): timeout
        while result is None or len(json_result['items']) < num_hosts:
            context.sleep(5)
            try:
                result = requests.get(url,
                                      auth=(self.ambari_user,
                                            self.ambari_password))
                json_result = json.loads(result.text)

                # TODO(jspeidel): just for debug
                LOG.info('Registered Hosts: {0} of {1}'.format(
                    len(json_result['items']), num_hosts))
                for hosts in json_result['items']:
                    LOG.debug('Registered Host: {0}'.format(
                        hosts['Hosts']['host_name']))
            except requests.ConnectionError:
                #TODO(jspeidel): max wait time
                LOG.info('Waiting to connect to ambari server ...')
Example #4
0
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        att_amount = 10
        while att_amount:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                    utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl':
                    "",
                })
                break
            context.sleep(3)
            att_amount -= 1

        if not att_amount:
            raise Exception("Cannot finish decommission")
Example #5
0
def _await_networks(cluster, instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not g.check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    LOG.info("Cluster '%s': all instances have IPs assigned" % cluster.id)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
    instances = _get_instances(cluster, ips_assigned)

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     _wait_until_accessible, instance)

    LOG.info("Cluster '%s': all instances are accessible" % cluster.id)
Example #6
0
    def _wait_for_host_registrations(self, num_hosts, ambari_info):
        LOG.info(
            'Waiting for all Ambari agents to register with server ...')

        url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
        result = None
        json_result = None

        #TODO(jspeidel): timeout
        while result is None or len(json_result['items']) < num_hosts:
            context.sleep(5)
            try:
                result = requests.get(url, auth=(ambari_info.user,
                                                 ambari_info.password))
                json_result = json.loads(result.text)

                # TODO(jspeidel): just for debug
                LOG.info('Registered Hosts: {0} of {1}'.format(
                    len(json_result['items']), num_hosts))
                for hosts in json_result['items']:
                    LOG.debug('Registered Host: {0}'.format(
                        hosts['Hosts']['host_name']))
            except requests.ConnectionError:
                #TODO(jspeidel): max wait time
                LOG.info('Waiting to connect to ambari server ...')
Example #7
0
def _await_networks(instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not _check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
    instances = get_instances(cluster, ips_assigned)

    accessible_instances = set()
    while len(accessible_instances) != len(instances):
        if not _check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in accessible_instances:
                if _check_if_accessible(instance):
                    accessible_instances.add(instance.id)

        context.sleep(1)
Example #8
0
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to("/etc/hadoop/dn.excl", utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        att_amount = 100
        while att_amount:
            cmd = r.execute_command("sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to(
                    {"/etc/hadoop/dn.incl": utils.generate_fqdn_host_names(survived_inst), "/etc/hadoop/dn.excl": ""}
                )
                break
            context.sleep(3)
            att_amount -= 1

        if not att_amount:
            raise Exception("Cannot finish decommission")
Example #9
0
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to("/etc/hadoop/tt.excl", utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to(
            {"/etc/hadoop/tt.incl": utils.generate_fqdn_host_names(survived_inst), "/etc/hadoop/tt.excl": ""}
        )
Example #10
0
    def wait_till_active(self):
        while self.heat_stack.stack_status in ('CREATE_IN_PROGRESS',
                                               'UPDATE_IN_PROGRESS'):
            context.sleep(1)
            self.heat_stack.get()

        if self.heat_stack.stack_status not in ('CREATE_COMPLETE',
                                                'UPDATE_COMPLETE'):
            raise ex.HeatStackException(self.heat_stack.stack_status)
Example #11
0
def _await_instances(cluster):
    """Await all instances are in Active status and available."""
    all_up = False
    while not all_up:
        all_up = True
        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_up(instance):
                    all_up = False
        context.sleep(1)
Example #12
0
def _await_attach_volume(instance, device_path):
    timeout = 10
    for _ in six.moves.xrange(timeout):
        device_paths = _get_device_paths(instance)
        if device_path in device_paths:
            return
        else:
            context.sleep(1)

    raise RuntimeError("Error attach volume to instance %s" %
                       instance.instance_name)
Example #13
0
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with jt.remote as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(jt.remote, "mradmin")
        context.sleep(3)
        r.write_files_to({'/etc/hadoop/tt.incl':
                         utils.generate_fqdn_host_names(survived_inst),
                         '/etc/hadoop/tt.excl': "",
                          })
Example #14
0
def _await_attach_volume(instance, device_path):
    timeout = 10
    for _ in six.moves.xrange(timeout):
        device_paths = _get_device_paths(instance)
        if device_path in device_paths:
            return
        else:
            context.sleep(1)

    raise RuntimeError("Error attach volume to instance %s" %
                       instance.instance_name)
Example #15
0
def _await_attach_volumes(instance, count_volumes):
    timeout = 10
    step = 2
    while timeout > 0:
        if len(_get_unmounted_devices(instance)) == count_volumes:
            return

        timeout -= step
        context.sleep(step)

    raise RuntimeError("Error attach volume to instance %s" %
                       instance.instance_name)
Example #16
0
def _await_attach_volumes(instance, count_volumes):
    timeout = 10
    step = 2
    while timeout > 0:
        if len(_get_unmounted_devices(instance)) == count_volumes:
            return

        timeout -= step
        context.sleep(step)

    raise RuntimeError("Error attach volume to instance %s" %
                       instance.instance_name)
Example #17
0
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({
            '/etc/hadoop/tt.incl':
            utils.generate_fqdn_host_names(survived_inst),
            '/etc/hadoop/tt.excl':
            "",
        })
Example #18
0
def _await_active(instances):
    """Await all instances are in Active status and available."""
    if not instances:
        return

    active_ids = set()
    while len(active_ids) != len(instances):
        if not _check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            if instance.id not in active_ids:
                if _check_if_active(instance):
                    active_ids.add(instance.id)

        context.sleep(1)
Example #19
0
def _create_attach_volume(instance, size, device_path, display_name=None,
                          volume_type=None):
    volume = cinder.client().volumes.create(size=size,
                                            display_name=display_name,
                                            volume_type=volume_type)
    instance.volumes.append(volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise RuntimeError("Volume %s has error status" % volume.id)

        context.sleep(1)

    nova.client().volumes.create_server_volume(instance.instance_id,
                                               volume.id, device_path)
Example #20
0
def _create_attach_volume(ctx, instance, size, display_name=None,
                          volume_type=None):
    volume = cinder.client().volumes.create(size=size,
                                            display_name=display_name,
                                            volume_type=volume_type)
    conductor.append_volume(ctx, instance, volume.id)

    while volume.status != 'available':
        volume = cinder.get_volume(volume.id)
        if volume.status == 'error':
            raise RuntimeError("Volume %s has error status" % volume.id)

        context.sleep(1)

    nova.client().volumes.create_server_volume(instance.instance_id,
                                               volume.id, None)
Example #21
0
def _wait_until_accessible(instance):
    while True:
        try:
            # check if ssh is accessible and cloud-init
            # script is finished generating authorized_keys
            exit_code, stdout = instance.remote().execute_command("ls .ssh/authorized_keys", raise_when_error=False)

            if exit_code == 0:
                LOG.debug("Instance %s is accessible" % instance.instance_name)
                return
        except Exception as ex:
            LOG.debug("Can't login to node %s (%s), reason %s", instance.instance_name, instance.management_ip, ex)

        context.sleep(5)

        if not g.check_cluster_exists(instance.node_group.cluster):
            return
Example #22
0
    def _await_active(self, cluster, instances):
        """Await all instances are in Active status and available."""
        if not instances:
            return

        active_ids = set()
        while len(active_ids) != len(instances):
            if not g.check_cluster_exists(instances[0].node_group.cluster):
                return
            for instance in instances:
                if instance.id not in active_ids:
                    if self._check_if_active(instance):
                        active_ids.add(instance.id)

            context.sleep(1)

        LOG.info("Cluster '%s': all instances are active" % cluster.id)
Example #23
0
def run_in_subprocess(proc, func, args=(), kwargs={}):
    try:
        pickle.dump(func, proc.stdin)
        pickle.dump(args, proc.stdin)
        pickle.dump(kwargs, proc.stdin)
        proc.stdin.flush()

        result = pickle.load(proc.stdout)

        if 'exception' in result:
            raise SubprocessException(result['exception'])

        return result['output']
    finally:
        # NOTE(dmitryme): in openstack/common/processutils.py it
        # is suggested to sleep a little between calls to multiprocessing.
        # That should allow it make some necessary cleanup
        context.sleep(0)
Example #24
0
def run_in_subprocess(proc, func, args=(), kwargs={}):
    try:
        pickle.dump(func, proc.stdin)
        pickle.dump(args, proc.stdin)
        pickle.dump(kwargs, proc.stdin)
        proc.stdin.flush()

        result = pickle.load(proc.stdout)

        if 'exception' in result:
            raise SubprocessException(result['exception'])

        return result['output']
    finally:
        # NOTE(dmitryme): in openstack/common/processutils.py it
        # is suggested to sleep a little between calls to multiprocessing.
        # That should allow it make some necessary cleanup
        context.sleep(0)
Example #25
0
    def _wait_for_async_request(self, request_url, ambari_info):
        started = False
        while not started:
            result = self._get(request_url, ambari_info)
            LOG.debug(
                'async request ' + request_url + ' response:\n' + result.text)
            json_result = json.loads(result.text)
            started = True
            for items in json_result['items']:
                status = items['Tasks']['status']
                if status == 'FAILED' or status == 'ABORTED':
                    return False
                else:
                    if status != 'COMPLETED':
                        started = False

            context.sleep(5)
        return started
Example #26
0
    def _wait_for_async_request(self, request_url, auth):
        started = False
        while not started:
            result = requests.get(request_url, auth=auth)
            LOG.debug(
                'async request ' + request_url + ' response:\n' + result.text)
            json_result = json.loads(result.text)
            started = True
            for items in json_result['items']:
                status = items['Tasks']['status']
                if status == 'FAILED' or status == 'ABORTED':
                    return False
                else:
                    if status != 'COMPLETED':
                        started = False

            context.sleep(5)
        return started
Example #27
0
    def start(self):
        url = ('/cluster/%s/services/%s/commands/start' %
               (self.cluster_name, self.service))

        self.rest.post(url)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 600
        cur_time = 0
        while cur_time < timeout:
            context.sleep(2)
            if self.status() == 'running':
                break
            else:
                cur_time += 2
        else:
            raise iex.IntelPluginException(
                "Service '%s' has failed to start in %s seconds" %
                (self.service, timeout))
Example #28
0
    def start(self):
        url = ('/cluster/%s/services/%s/commands/start'
               % (self.cluster_name, self.service))

        self.rest.post(url)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 600
        cur_time = 0
        while cur_time < timeout:
            context.sleep(2)
            if self.status() == 'running':
                break
            else:
                cur_time += 2
        else:
            raise iex.IntelPluginException(
                "Service '%s' has failed to start in %s seconds"
                % (self.service, timeout))
Example #29
0
def _wait_until_accessible(instance):
    while True:
        try:
            # check if ssh is accessible and cloud-init
            # script is finished generating id_rsa
            exit_code, stdout = instance.remote.execute_command(
                "ls .ssh/id_rsa", raise_when_error=False)

            if exit_code == 0:
                LOG.debug('Instance %s is accessible' % instance.instance_name)
                return
        except Exception as ex:
            LOG.debug("Can't login to node %s (%s), reason %s",
                      instance.instance_name, instance.management_ip, ex)

        context.sleep(5)

        if not _check_cluster_exists(instance.node_group.cluster):
            return
Example #30
0
    def _wait_for_async_request(self, request_id, cluster_name, ambari_host):
        request_url = "http://{0}:8080/api/v1/clusters/{1}/requests/{" "2}/tasks?fields=Tasks/status".format(
            ambari_host.management_ip, cluster_name, request_id
        )
        started = False
        while not started:
            result = requests.get(request_url, auth=(self.ambari_user, self.ambari_password))
            LOG.debug("async request " + request_url + " response:\n" + result.text)
            json_result = json.loads(result.text)
            started = True
            for items in json_result["items"]:
                status = items["Tasks"]["status"]
                if status == "FAILED" or status == "ABORTED":
                    return False
                else:
                    if status != "COMPLETED":
                        started = False

            context.sleep(5)
        return started
Example #31
0
    def _await_datanodes(self, cluster):
        datanodes_count = len(utils.get_datanodes(cluster))
        if datanodes_count < 1:
            return

        LOG.info("Waiting %s datanodes to start up" % datanodes_count)
        with remote.get_remote(utils.get_namenode(cluster)) as r:
            while True:
                if run.check_datanodes_count(r, datanodes_count):
                    LOG.info('Datanodes on cluster %s has been started' %
                             cluster.name)
                    return

                context.sleep(1)

                if not g.check_cluster_exists(cluster):
                    LOG.info(
                        'Stop waiting datanodes on cluster %s since it has '
                        'been deleted' % cluster.name)
                    return
Example #32
0
    def _await_datanodes(self, cluster):
        datanodes_count = len(utils.get_datanodes(cluster))
        if datanodes_count < 1:
            return

        LOG.info("Waiting %s datanodes to start up" % datanodes_count)
        with remote.get_remote(utils.get_namenode(cluster)) as r:
            while True:
                if run.check_datanodes_count(r, datanodes_count):
                    LOG.info(
                        'Datanodes on cluster %s has been started' %
                        cluster.name)
                    return

                context.sleep(1)

                if not g.check_cluster_exists(cluster):
                    LOG.info(
                        'Stop waiting datanodes on cluster %s since it has '
                        'been deleted' % cluster.name)
                    return
Example #33
0
    def _wait_for_host_registrations(self, num_hosts, ambari_host):
        LOG.info("Waiting for all Ambari agents to register with server ...")

        url = "http://{0}:8080/api/v1/hosts".format(ambari_host.management_ip)
        result = None
        json_result = None

        # TODO(jspeidel): timeout
        while result is None or len(json_result["items"]) < num_hosts:
            context.sleep(5)
            try:
                result = requests.get(url, auth=(self.ambari_user, self.ambari_password))
                json_result = json.loads(result.text)

                # TODO(jspeidel): just for debug
                LOG.info("Registered Hosts: {0} of {1}".format(len(json_result["items"]), num_hosts))
                for hosts in json_result["items"]:
                    LOG.debug("Registered Host: {0}".format(hosts["Hosts"]["host_name"]))
            except requests.ConnectionError:
                # TODO(jspeidel): max wait time
                LOG.info("Waiting to connect to ambari server ...")
Example #34
0
    def _wait_for_async_request(self, request_id, cluster_name, ambari_host):
        request_url = 'http://{0}:8080/api/v1/clusters/{1}/requests/{' \
                      '2}/tasks?fields=Tasks/status'.format(
                          ambari_host.management_ip, cluster_name, request_id)
        started = False
        while not started:
            result = requests.get(request_url, auth=('admin', 'admin'))
            LOG.debug(
                'async request ' + request_url + ' response:\n' + result.text)
            json_result = json.loads(result.text)
            started = True
            for items in json_result['items']:
                status = items['Tasks']['status']
                if status == 'FAILED' or status == 'ABORTED':
                    return False
                else:
                    if status != 'COMPLETED':
                        started = False

            context.sleep(5)
        return started
Example #35
0
def wait(ctx, session_id):
    #TODO(lazarev) add check on savanna cluster state (exit on delete)
    #TODO(alazarev) make configurable (bug #1262897)
    timeout = 4 * 60 * 60  # 4 hours
    cur_time = 0
    while cur_time < timeout:
        info_items = get(ctx, session_id)['items']
        for item in info_items:
            progress = item['nodeprogress']
            if progress['info'].strip() == '_ALLFINISH':
                return
            else:
                context.sleep(10)
                cur_time += 10

            debug_msg = 'Hostname: %s\nInfo: %s'
            debug_msg = debug_msg % (progress['hostname'], progress['info'])
            LOG.debug(debug_msg)
    else:
        raise iex.IntelPluginException(
            "Cluster '%s' has failed to start in %s minutes" %
            (ctx.cluster_name, timeout / 60))
Example #36
0
    def wait_for_host_registrations(self, num_hosts, ambari_info):
        LOG.info('Waiting for all Ambari agents to register with server ...')

        url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
        result = None
        json_result = None

        #TODO(jspeidel): timeout
        while result is None or len(json_result['items']) < num_hosts:
            context.sleep(5)
            try:
                result = self._get(url, ambari_info)
                json_result = json.loads(result.text)

                LOG.info('Registered Hosts: {0} of {1}'.format(
                    len(json_result['items']), num_hosts))
                for hosts in json_result['items']:
                    LOG.debug('Registered Host: {0}'.format(
                        hosts['Hosts']['host_name']))
            except requests.ConnectionError:
                #TODO(jspeidel): max wait time
                LOG.info('Waiting to connect to ambari server ...')
Example #37
0
def wait(ctx, session_id):
    #TODO(lazarev) add check on savanna cluster state (exit on delete)
    #TODO(alazarev) make configurable (bug #1262897)
    timeout = 4*60*60  # 4 hours
    cur_time = 0
    while cur_time < timeout:
        info_items = get(ctx, session_id)['items']
        for item in info_items:
            progress = item['nodeprogress']
            if progress['info'].strip() == '_ALLFINISH':
                return
            else:
                context.sleep(10)
                cur_time += 10

            debug_msg = 'Hostname: %s\nInfo: %s'
            debug_msg = debug_msg % (progress['hostname'], progress['info'])
            LOG.debug(debug_msg)
    else:
        raise iex.IntelPluginException(
            "Cluster '%s' has failed to start in %s minutes"
            % (ctx.cluster_name, timeout / 60))
Example #38
0
def _await_instances(cluster):
    """Await all instances are in Active status and available."""
    ctx = context.ctx()
    all_up = False
    is_accesible = set()
    while not all_up:
        all_up = True

        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_up(instance):
                    all_up = False

        cluster = conductor.cluster_get(ctx, cluster)

        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_accessible(instance, is_accesible):
                    all_up = False

        context.sleep(1)

    return cluster
Example #39
0
def _await_instances(cluster):
    """Await all instances are in Active status and available."""
    ctx = context.ctx()
    all_up = False
    is_accesible = set()
    while not all_up:
        all_up = True

        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_up(instance):
                    all_up = False

        cluster = conductor.cluster_get(ctx, cluster)

        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                if not _check_if_accessible(instance, is_accesible):
                    all_up = False

        context.sleep(1)

    return cluster
Example #40
0
    def _wait_for_async_request(self, request_id, cluster_name, ambari_host):
        request_url = 'http://{0}:8080/api/v1/clusters/{1}/requests/{' \
                      '2}/tasks?fields=Tasks/status'.format(
                          ambari_host.management_ip, cluster_name, request_id)
        started = False
        while not started:
            result = requests.get(request_url,
                                  auth=(self.ambari_user,
                                        self.ambari_password))
            LOG.debug('async request ' + request_url + ' response:\n' +
                      result.text)
            json_result = json.loads(result.text)
            started = True
            for items in json_result['items']:
                status = items['Tasks']['status']
                if status == 'FAILED' or status == 'ABORTED':
                    return False
                else:
                    if status != 'COMPLETED':
                        started = False

            context.sleep(5)
        return started
Example #41
0
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [dn.fqdn() for dn in u.get_tasktrackers(cluster)]

    mng_ip = u.get_instances(cluster, 'manager')[0].management_ip
    client = c.IntelClient(mng_ip, cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout/60))

    client.nodes.stop(dec_hosts)

    # wait stop services
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout/60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #42
0
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: \'%s\'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager ")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError("Unable to download IDH manager from %s",
                               idh_tarball_path, e)

        # unpack archive
        LOG.info("Unpack manager %s ", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s",
                               idh_tarball_filename, e)

        # install idh
        LOG.debug("Install manager with %s : ", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        #TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx persimmions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # waiting start idh manager
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    LOG.debug("Waiting %s seconds for Manager to start : ", timeout)
    while timeout:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            timeout -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'"
                   % (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
Example #43
0
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: \'%s\'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(cluster.cluster_configs.get('general'),
                                        c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager ")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError("Unable to download IDH manager from %s",
                               idh_tarball_path, e)

        # unpack archive
        LOG.info("Unpack manager %s ", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s", idh_tarball_filename,
                               e)

        # install idh
        LOG.debug("Install manager with %s : ", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        #TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx persimmions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # waiting start idh manager
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    LOG.debug("Waiting %s seconds for Manager to start : ", timeout)
    while timeout:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            timeout -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'" %
                   (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
Example #44
0
 def _add_element(self, lst, i):
     context.sleep(rnd.uniform(0, 0.1))
     lst.append(i)
Example #45
0
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [dn.fqdn() for dn in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait stop services
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #46
0
 def _add_element(self, lst, i):
     context.sleep(rnd.uniform(0, 0.1))
     lst.append(i)