Example #1
class GearJobs(object):
    def __init__(self, logger, args):
        self.logger = logger
        self.gm_client = JSONGearmanClient(args.server)

    def send_pings(self, node_list):
        list_of_jobs = []
        failed_list = []
        job_data = {"hpcs_action": "STATS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_pings = self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=5.0
        )
        for ping in submitted_pings:
            if ping.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
                continue
            if ping.timed_out:
                # Ping timeout
                failed_list.append(ping.job.task)
                continue
            if ping.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(ping.job.task)
                continue

        return failed_list

    def send_repair(self, node_list):
        list_of_jobs = []
        repaired_list = []
        job_data = {"hpcs_action": "STATS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_pings = self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=5.0
        )
        for ping in submitted_pings:
            if ping.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
                continue
            elif ping.timed_out:
                # Ping timeout
                continue
            elif ping.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                continue
            else:
                repaired_list.append(ping.job.task)

        return repaired_list
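send_pings and send_repair above build the same job list and submit it with identical arguments; only the bookkeeping on the results differs. A minimal consolidation sketch for the GearJobs class (the _submit_jobs helper name is hypothetical; it assumes the python-gearman request objects behave exactly as shown above):

    def _submit_jobs(self, node_list, job_data):
        # Build one job per node and submit the whole batch, blocking
        # until every request completes or times out
        jobs = [dict(task=str(node), data=job_data) for node in node_list]
        return self.gm_client.submit_multiple_jobs(
            jobs, background=False, wait_until_complete=True,
            poll_timeout=5.0
        )

    def send_pings(self, node_list):
        failed_list = []
        for ping in self._submit_jobs(node_list, {"hpcs_action": "STATS"}):
            if ping.state == 'UNKNOWN':
                # Gearman server failed; skip rather than mark the node bad
                self.logger.error('Gearman Job server fail')
            elif ping.timed_out or ping.result['hpcs_response'] == 'FAIL':
                failed_list.append(ping.job.task)
        return failed_list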
Example #2
    def _test_node(self, name):
        """ Run diags on node, blow it away if bad """
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': cfg.CONF['gearman']['ssl_key'],
                                'certfile': cfg.CONF['gearman']['ssl_cert'],
                                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                                'keepalive': cfg.CONF['gearman']['keepalive'],
                                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                                'keepidle': cfg.CONF['gearman']['keepidle'],
                                'keepintvl': cfg.CONF['gearman']['keepintvl']})
        gm_client = JSONGearmanClient(server_list)

        job_data = {'hpcs_action': 'DIAGNOSTICS'}
        job_status = gm_client.submit_job(
            str(name), job_data, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=10
        )
        if job_status.state == JOB_UNKNOWN:
            # Gearman server connect fail, count as bad node because we can't
            # tell if it really is working
            LOG.error('Could not talk to gearman server')
            return False
        if job_status.timed_out:
            LOG.warning('Timeout getting diags from {0}'.format(name))
            return False
        LOG.debug(job_status.result)
        # A FAIL here means the worker does not support the DIAGNOSTICS
        # call, so count the node as good rather than failing it
        if job_status.result['hpcs_response'] == 'FAIL':
            return True

        if job_status.result['network'] == 'FAIL':
            return False

        gearman_count = 0
        gearman_fail = 0
        for gearman_test in job_status.result['gearman']:
            gearman_count += 1
            if gearman_test['status'] == 'FAIL':
                LOG.info(
                    'Device {0} cannot talk to gearman {1}'
                    .format(name, gearman_test['host'])
                )
                gearman_fail += 1
        # Need at least 2/3 of the gearman servers up
        max_fail_count = gearman_count / 3
        if gearman_fail > max_fail_count:
            return False
        return True
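The quorum arithmetic at the end of _test_node is easy to restate on its own. A standalone sketch of the same rule (the function name is hypothetical; `results` is the job's 'gearman' list as returned above):

    def gearman_quorum_ok(results):
        # At least 2/3 of the gearman servers must pass: with integer
        # division, 3 servers tolerate 1 failure, 6 tolerate 2, and so on
        failures = sum(1 for test in results if test['status'] == 'FAIL')
        return failures <= len(results) // 3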
Example #3
    def __init__(self):
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gearman_client = JSONGearmanClient(server_list)
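Examples #3 through #8 (and the constructors later in this section) rebuild the same nine-key server dictionary from configuration. A deduplication sketch, assuming the cfg.CONF['gearman'] keys shown above (the helper name is hypothetical):

    def build_gearman_server_list(gearman_conf):
        # Translate 'host:port' strings plus the shared SSL/keepalive
        # settings into the dict list JSONGearmanClient expects
        server_list = []
        for server in gearman_conf['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': gearman_conf['ssl_key'],
                'certfile': gearman_conf['ssl_cert'],
                'ca_certs': gearman_conf['ssl_ca'],
                'keepalive': gearman_conf['keepalive'],
                'keepcnt': gearman_conf['keepcnt'],
                'keepidle': gearman_conf['keepidle'],
                'keepintvl': gearman_conf['keepintvl']
            })
        return server_list

Each __init__ would then reduce to self.gearman_client = JSONGearmanClient(build_gearman_server_list(cfg.CONF['gearman'])).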
Example #4
    def __init__(self, host, lbid):
        self.host = host
        self.lbid = lbid

        server_list = []
        for server in conf.gearman.server:
            ghost, gport = server.split(':')
            server_list.append({
                'host': ghost,
                'port': int(gport),
                'keyfile': conf.gearman.ssl_key,
                'certfile': conf.gearman.ssl_cert,
                'ca_certs': conf.gearman.ssl_ca,
                'keepalive': conf.gearman.keepalive,
                'keepcnt': conf.gearman.keepcnt,
                'keepidle': conf.gearman.keepidle,
                'keepintvl': conf.gearman.keepintvl
            })
        self.gearman_client = JSONGearmanClient(server_list)
Example #5
    def __init__(self):
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gm_client = JSONGearmanClient(server_list)
Example #6
    def __init__(self):
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': cfg.CONF['gearman']['ssl_key'],
                                'certfile': cfg.CONF['gearman']['ssl_cert'],
                                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                                'keepalive': cfg.CONF['gearman']['keepalive'],
                                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                                'keepidle': cfg.CONF['gearman']['keepidle'],
                                'keepintvl': cfg.CONF['gearman']['keepintvl']})
        self.gearman_client = JSONGearmanClient(server_list)
Example #7
    def __init__(self, host, lbid):
        self.host = host
        self.lbid = lbid

        server_list = []
        for server in conf.gearman.server:
            ghost, gport = server.split(':')
            server_list.append({'host': ghost,
                                'port': int(gport),
                                'keyfile': conf.gearman.ssl_key,
                                'certfile': conf.gearman.ssl_cert,
                                'ca_certs': conf.gearman.ssl_ca,
                                'keepalive': conf.gearman.keepalive,
                                'keepcnt': conf.gearman.keepcnt,
                                'keepidle': conf.gearman.keepidle,
                                'keepintvl': conf.gearman.keepintvl})
        self.gearman_client = JSONGearmanClient(server_list)
Example #8
    def __init__(self):
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': cfg.CONF['gearman']['ssl_key'],
                                'certfile': cfg.CONF['gearman']['ssl_cert'],
                                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                                'keepalive': cfg.CONF['gearman']['keepalive'],
                                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                                'keepidle': cfg.CONF['gearman']['keepidle'],
                                'keepintvl': cfg.CONF['gearman']['keepintvl']
                                })
        self.gm_client = JSONGearmanClient(server_list)
Example #9
class GearmanClientThread(object):
    def __init__(self, host, lbid):
        self.host = host
        self.lbid = lbid

        server_list = []
        for server in conf.gearman.server:
            ghost, gport = server.split(':')
            server_list.append({'host': ghost,
                                'port': int(gport),
                                'keyfile': conf.gearman.ssl_key,
                                'certfile': conf.gearman.ssl_cert,
                                'ca_certs': conf.gearman.ssl_ca,
                                'keepalive': conf.gearman.keepalive,
                                'keepcnt': conf.gearman.keepcnt,
                                'keepidle': conf.gearman.keepidle,
                                'keepintvl': conf.gearman.keepintvl})
        self.gearman_client = JSONGearmanClient(server_list)

    def send_assign(self, data):
        NULL = None  # For pep8
        with db_session() as session:
            device = session.query(Device).\
                filter(Device.name == data).first()
            if device is None:
                LOG.error(
                    "VIP assign have been given non existent device {0}"
                    .format(data)
                )
                session.rollback()
                return False
            if not self.lbid:
                vip = session.query(Vip).\
                    filter(Vip.device == NULL).\
                    with_lockmode('update').\
                    first()
                if vip is None:
                    errmsg = 'Floating IP assign failed (none available)'
                    LOG.error(
                        "Failed to assign IP to device {0} (none available)"
                        .format(data)
                    )
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            else:
                vip = session.query(Vip).\
                    filter(Vip.id == self.lbid).first()
                if vip is None:
                    errmsg = 'Cannot find existing floating IP'
                    LOG.error(
                        "Failed to assign IP to device {0}"
                        .format(data)
                    )
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            vip.device = device.id
            vip_id = vip.id
            vip_ip = vip.ip
            session.commit()
        ip_str = str(ipaddress.IPv4Address(vip_ip))

        job_data = {
            'action': 'ASSIGN_IP',
            'name': data,
            'ip': ip_str
        }
        status, response = self._send_message(job_data, 'response')
        if status:
            return True
        elif self.lbid:
            LOG.error(
                "Failed to assign IP {0} to device {1}"
                .format(ip_str, data)
            )
        else:
            LOG.error(
                "Failed to assign IP {0} to device {1}"
                .format(ip_str, data)
            )
            # set to device 0 to make sure it won't be used again
            with db_session() as session:
                vip = session.query(Vip).filter(Vip.id == vip_id).first()
                vip.device = 0
                session.commit()
            submit_vip_job('REMOVE', None, ip_str)
        return False

    def send_remove(self, data=None):
        job_data = {
            'action': 'DELETE_IP',
            'ip': self.lbid
        }
        ip_int = int(ipaddress.IPv4Address(unicode(self.lbid)))
        for x in xrange(0, 5):
            LOG.info(
                'Attempt to delete IP {0} #{1}'
                .format(self.lbid, x)
            )
            status, response = self._send_message(job_data, 'response')
            if status:
                break
        with db_session() as session:
            if not status:
                LOG.error(
                    "Failed to delete IP {0}"
                    .format(self.lbid)
                )
                # Set to 0 to mark as something that needs cleaning up
                # but cannot be used again
                vip = session.query(Vip).\
                    filter(Vip.ip == ip_int).first()
                vip.device = 0
            else:
                session.query(Vip).\
                    filter(Vip.ip == ip_int).delete()
                counter = session.query(Counters).\
                    filter(Counters.name == 'vips_deleted').first()
                counter.value += 1
            session.commit()

    def send_delete(self, data):
        with db_session() as session:
            count = session.query(
                LoadBalancer
            ).join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.id != self.lbid).\
                filter(LoadBalancer.status != 'DELETED').\
                filter(LoadBalancer.status != 'PENDING_DELETE').\
                count()
            if count >= 1:
                # This is an update message because we want to retain the
                # remaining LB
                keep_lb = session.query(LoadBalancer).\
                    join(LoadBalancer.nodes).\
                    join(LoadBalancer.devices).\
                    filter(Device.id == data).\
                    filter(LoadBalancer.id != self.lbid).\
                    filter(LoadBalancer.status != 'DELETED').\
                    filter(LoadBalancer.status != 'PENDING_DELETE').\
                    first()
                job_data = {
                    'hpcs_action': 'UPDATE',
                    'loadBalancers': [{
                        'name': keep_lb.name,
                        'protocol': keep_lb.protocol,
                        'algorithm': keep_lb.algorithm,
                        'port': keep_lb.port,
                        'nodes': []
                    }]
                }
                for node in keep_lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    node_data = {
                        'id': node.id, 'port': node.port,
                        'address': node.address, 'weight': node.weight,
                        'condition': condition
                    }
                    job_data['loadBalancers'][0]['nodes'].append(node_data)
            else:
                # This is a delete
                dev = session.query(Device.name).\
                    filter(Device.id == data).first()
                vip = session.query(Vip).\
                    filter(Vip.device == data).first()
                if vip:
                    submit_vip_job(
                        'REMOVE', dev.name, str(ipaddress.IPv4Address(vip.ip))
                    )
                job_data = {"hpcs_action": "DELETE"}

            status, response = self._send_message(job_data, 'hpcs_response')
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            if not status:
                LOG.error(
                    "Failed Gearman delete for LB {0}".format(lb.id)
                )
                self._set_error(data, response, session)
            lb.status = 'DELETED'
            tenant_id = lb.tenantid

            if count == 0:
                # Device should never be used again
                device = session.query(Device).\
                    filter(Device.id == data).first()
                device.status = 'DELETED'
            # Remove LB-device join
            session.execute(loadbalancers_devices.delete().where(
                loadbalancers_devices.c.loadbalancer == lb.id
            ))
            session.query(Node).\
                filter(Node.lbid == lb.id).delete()
            session.query(HealthMonitor).\
                filter(HealthMonitor.lbid == lb.id).delete()
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_deleted').first()
            counter.value += 1
            session.commit()

            # Notify billing of the LB deletion
            update_mnb('lbaas.instance.delete', self.lbid, tenant_id)

    def _set_error(self, device_id, errmsg, session):
        lbs = session.query(
            LoadBalancer
        ).join(LoadBalancer.nodes).\
            join(LoadBalancer.devices).\
            filter(Device.id == device_id).\
            filter(LoadBalancer.status != 'DELETED').\
            all()
        device = session.query(Device).\
            filter(Device.id == device_id).\
            first()
        if device is None:
            # Device already deleted, probably a race between the OFFLINE check
            # and auto-failover
            return
        device.status = 'ERROR'
        counter = session.query(Counters).\
            filter(Counters.name == 'loadbalancers_error').first()
        for lb in lbs:
            counter.value += 1
            lb.status = 'ERROR'
            lb.errmsg = errmsg

    def send_archive(self, data):
        with db_session() as session:
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            job_data = {
                'hpcs_action': 'ARCHIVE',
                'hpcs_object_store_basepath': data['objectStoreBasePath'],
                'hpcs_object_store_endpoint': data['objectStoreEndpoint'],
                'hpcs_object_store_token': data['authToken'],
                'hpcs_object_store_type': data['objectStoreType'],
                'loadBalancers': [{
                    'id': str(lb.id),
                    'name': lb.name,
                    'protocol': lb.protocol
                }]
            }
            status, response = self._send_message(job_data, 'hpcs_response')
            device = session.query(Device).\
                filter(Device.id == data['deviceid']).\
                first()
            if status:
                device.errmsg = 'Log archive successful'
            else:
                device.errmsg = 'Log archive failed: {0}'.format(response)
            lb.status = 'ACTIVE'
            counter = session.query(Counters).\
                filter(Counters.name == 'log_archives').first()
            counter.value += 1
            session.commit()

    def send_update(self, data):
        with db_session() as session:
            lbs = session.query(
                LoadBalancer
            ).join(LoadBalancer.nodes).\
                join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.status != 'DELETED').\
                all()
            job_data = {
                'hpcs_action': 'UPDATE',
                'loadBalancers': []
            }

            degraded = []
            if lbs is None:
                LOG.error(
                    'Attempting to send empty LB data for device {0} ({1}), '
                    'something went wrong'.format(data, self.host)
                )
                self._set_error(data, "LB config error", session)
                session.commit()
                return

            for lb in lbs:
                lb_data = {
                    'name': lb.name,
                    'protocol': lb.protocol,
                    'algorithm': lb.algorithm,
                    'port': lb.port,
                    'nodes': [],
                    'monitor': {}
                }
                for node in lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    backup = 'FALSE'
                    if node.backup != 0:
                        backup = 'TRUE'
                    node_data = {
                        'id': node.id, 'port': node.port,
                        'address': node.address, 'weight': node.weight,
                        'condition': condition, 'backup': backup
                    }

                    lb_data['nodes'].append(node_data)
                    # Track if we have a DEGRADED LB
                    if node.status == 'ERROR':
                        degraded.append(lb.id)

                # Add a default health monitor if one does not exist
                monitor = session.query(HealthMonitor).\
                    filter(HealthMonitor.lbid == lb.id).first()

                if monitor is None:
                    # Set it to a default configuration
                    monitor = HealthMonitor(
                        lbid=lb.id, type="CONNECT", delay=30,
                        timeout=30, attempts=2, path=None
                    )
                    session.add(monitor)
                    session.flush()

                monitor_data = {
                    'type': monitor.type,
                    'delay': monitor.delay,
                    'timeout': monitor.timeout,
                    'attempts': monitor.attempts
                }
                if monitor.path is not None:
                    monitor_data['path'] = monitor.path

                # All new LBs created since these options were supported
                # will have default values in the DB. Pre-existing LBs will
                # not have any values, so we need to check for that.
                if any([lb.timeout, lb.retries]):
                    lb_data['options'] = {
                        'client_timeout': lb.timeout,
                        'server_timeout': lb.timeout,
                        'connect_timeout': lb.timeout,
                        'connect_retries': lb.retries
                    }

                lb_data['monitor'] = monitor_data
                job_data['loadBalancers'].append(lb_data)

            # Update the worker
            mnb_data = {}
            status, response = self._send_message(job_data, 'hpcs_response')
            if not status:
                self._set_error(data, response, session)
            else:
                for lb in lbs:
                    if lb.id in degraded:
                        lb.status = 'DEGRADED'
                        lb.errmsg = "A node on the load balancer has failed"
                    elif lb.status == 'ERROR':
                        # Do nothing because something else failed in the mean
                        # time
                        pass
                    elif lb.status == 'BUILD':
                        # Do nothing if a new device, stay in BUILD state until
                        # floating IP assign finishes
                        if len(lbs) > 1:
                            lb.status = 'ACTIVE'
                            if lb.id == self.lbid:
                                # This is the new LB being added to a device.
                                # We don't have to assign a vip so we can
                                # notify billing of the LB creation (once the
                                # DB is updated)
                                mnb_data["lbid"] = lb.id
                                mnb_data["tenantid"] = lb.tenantid
                    else:
                        lb.status = 'ACTIVE'
                        lb.errmsg = None
            device = session.query(Device).\
                filter(Device.id == data).\
                first()
            if device is None:
                # Shouldn't hit here, but just to be safe
                session.commit()
                return
            if device.status == 'BUILD' and len(lbs) > 1:
                device.status = 'ONLINE'
            device_name = device.name
            device_status = device.status
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_updated').first()
            counter.value += 1
            session.commit()
            if device_status == 'BUILD':
                submit_vip_job(
                    'ASSIGN', device_name, None
                )

            # Send the MnB create if needed
            if "lbid" in mnb_data:
                update_mnb('lbaas.instance.create',
                           mnb_data["lbid"],
                           mnb_data["tenantid"])

    def _send_message(self, message, response_name):
        job_status = self.gearman_client.submit_job(
            self.host, message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=120.0
        )
        if job_status.state == 'UNKNOWN':
            # Gearman server connection failed
            LOG.error('Could not talk to gearman server')
            return False, "System error communicating with load balancer"
        if job_status.timed_out:
            # Job timed out
            LOG.warning(
                'Gearman timeout talking to {0}'.format(self.host)
            )
            return False, "Timeout error communicating with load balancer"
        LOG.debug(job_status.result)
        if 'badRequest' in job_status.result:
            error = job_status.result['badRequest']['validationErrors']
            return False, error['message']
        if job_status.result[response_name] == 'FAIL':
            # Worker says 'no'
            if 'hpcs_error' in job_status.result:
                error = job_status.result['hpcs_error']
            else:
                error = 'Load Balancer error'
            LOG.error(
                'Gearman error response from {0}: {1}'.format(self.host, error)
            )
            return False, error
        LOG.info('Gearman success from {0}'.format(self.host))
        return True, job_status.result
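_send_message above is the client half of a small response protocol: a result may carry a badRequest validation error, and otherwise must contain the named response field ('response' or 'hpcs_response') set to PASS or FAIL, optionally with an hpcs_error message. Illustrative reply shapes matching the checks in the code (the values are made up, not from the source):

    # Accepted: _send_message returns (True, job_status.result)
    ok_reply = {'hpcs_response': 'PASS'}

    # Worker-side failure: returns (False, 'haproxy restart failed')
    fail_reply = {'hpcs_response': 'FAIL',
                  'hpcs_error': 'haproxy restart failed'}

    # Validation failure: returns (False, 'port out of range')
    bad_request = {'badRequest': {
        'validationErrors': {'message': 'port out of range'}}}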
Example #10
class GearmanWork(object):

    def __init__(self):
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': cfg.CONF['gearman']['ssl_key'],
                                'certfile': cfg.CONF['gearman']['ssl_cert'],
                                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                                'keepalive': cfg.CONF['gearman']['keepalive'],
                                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                                'keepidle': cfg.CONF['gearman']['keepidle'],
                                'keepintvl': cfg.CONF['gearman']['keepintvl']
                                })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=30.0
        )
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB'
                )

            delete_count += 1
            with db_session() as session:
                session.query(Device).\
                    filter(Device.name == status.result['name']).delete()
                session.commit()

        LOG.info('%d freed devices deleted from pool', delete_count)

    def send_vips_message(self, message):
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            built_count += 1
            try:
                self._add_vip(status.result)
            except Exception:
                LOG.exception(
                    'Could not add vip to DB, node data: {0}'
                    .format(status.result)
                )
        LOG.info(
            '{vips} vips built and added to pool'.format(vips=built_count)
        )

    def send_create_message(self, message):
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending {0} gearman messages".format(len(message)))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            built_count += 1
            try:
                self._add_node(status.result)
            except Exception:
                LOG.exception(
                    'Could not add node to DB, node data: {0}'
                    .format(status.result)
                )
        LOG.info(
            '{nodes} devices built and added to pool'.format(nodes=built_count)
        )

    def _add_vip(self, data):
        LOG.info('Adding vip {0} to DB'.format(data['ip']))
        vip = Vip()
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _add_node(self, data):
        LOG.info('Adding device {0} to DB'.format(data['name']))
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = 'OFFLINE'
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()

    def _add_bad_node(self, data):
        LOG.info(
            'Adding bad device {0} to DB to be deleted'.format(data['name'])
        )
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = 'DELETED'
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()
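Both send_vips_message and send_create_message carry a TODO about not blocking on the whole batch. One possible shape, sketched under the assumption that python-gearman's wait_until_jobs_completed behaves as its documentation describes (submit without waiting, let the caller do other work, then reap the batch); the method names are hypothetical:

    def submit_create_messages(self, message):
        # Non-blocking variant: returns the pending request objects
        return self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=False,
            max_retries=10
        )

    def reap_create_messages(self, requests):
        # Poll once for the whole batch; unfinished requests come back
        # with timed_out set, matching the checks used above
        return self.gearman_client.wait_until_jobs_completed(
            requests, poll_timeout=3600.0
        )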
Example #11
class GearmanClientThread(object):
    def __init__(self, host, lbid):
        self.host = host
        self.lbid = lbid

        server_list = []
        for server in conf.gearman.server:
            ghost, gport = server.split(':')
            server_list.append({
                'host': ghost,
                'port': int(gport),
                'keyfile': conf.gearman.ssl_key,
                'certfile': conf.gearman.ssl_cert,
                'ca_certs': conf.gearman.ssl_ca,
                'keepalive': conf.gearman.keepalive,
                'keepcnt': conf.gearman.keepcnt,
                'keepidle': conf.gearman.keepidle,
                'keepintvl': conf.gearman.keepintvl
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_assign(self, data):
        NULL = None  # For pep8
        with db_session() as session:
            device = session.query(Device).\
                filter(Device.name == data).first()
            if device is None:
                LOG.error("VIP assign have been given non existent device {0}".
                          format(data))
                session.rollback()
                return False
            if not self.lbid:
                vip = session.query(Vip).\
                    filter(Vip.device == NULL).\
                    with_lockmode('update').\
                    first()
                if vip is None:
                    errmsg = 'Floating IP assign failed (none available)'
                    LOG.error(
                        "Failed to assign IP to device {0} (none available)".
                        format(data))
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            else:
                vip = session.query(Vip).\
                    filter(Vip.id == self.lbid).first()
                if vip is None:
                    errmsg = 'Cannot find existing floating IP'
                    LOG.error("Failed to assign IP to device {0}".format(data))
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            vip.device = device.id
            vip_id = vip.id
            vip_ip = vip.ip
            session.commit()
        ip_str = str(ipaddress.IPv4Address(vip_ip))

        job_data = {'action': 'ASSIGN_IP', 'name': data, 'ip': ip_str}
        status, response = self._send_message(job_data, 'response')
        if status:
            return True
        elif self.lbid:
            LOG.error("Failed to assign IP {0} to device {1}".format(
                ip_str, data))
        else:
            LOG.error("Failed to assign IP {0} to device {1}".format(
                ip_str, data))
            # set to device 0 to make sure it won't be used again
            with db_session() as session:
                vip = session.query(Vip).filter(Vip.id == vip_id).first()
                vip.device = 0
                session.commit()
            submit_vip_job('REMOVE', None, ip_str)
        return False

    def send_remove(self, data=None):
        job_data = {'action': 'DELETE_IP', 'ip': self.lbid}
        ip_int = int(ipaddress.IPv4Address(unicode(self.lbid)))
        for x in xrange(0, 5):
            LOG.info('Attempt to delete IP {0} #{1}'.format(self.lbid, x))
            status, response = self._send_message(job_data, 'response')
            if status:
                break
        with db_session() as session:
            if not status:
                LOG.error("Failed to delete IP {0}".format(self.lbid))
                # Set to 0 to mark as something that needs cleaning up
                # but cannot be used again
                vip = session.query(Vip).\
                    filter(Vip.ip == ip_int).first()
                vip.device = 0
            else:
                session.query(Vip).\
                    filter(Vip.ip == ip_int).delete()
                counter = session.query(Counters).\
                    filter(Counters.name == 'vips_deleted').first()
                counter.value += 1
            session.commit()

    def send_delete(self, data):
        with db_session() as session:
            count = session.query(
                LoadBalancer
            ).join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.id != self.lbid).\
                filter(LoadBalancer.status != 'DELETED').\
                filter(LoadBalancer.status != 'PENDING_DELETE').\
                count()
            if count >= 1:
                # This is an update message because we want to retain the
                # remaining LB
                keep_lb = session.query(LoadBalancer).\
                    join(LoadBalancer.nodes).\
                    join(LoadBalancer.devices).\
                    filter(Device.id == data).\
                    filter(LoadBalancer.id != self.lbid).\
                    filter(LoadBalancer.status != 'DELETED').\
                    filter(LoadBalancer.status != 'PENDING_DELETE').\
                    first()
                job_data = {
                    'hpcs_action': 'UPDATE',
                    'loadBalancers': [{
                        'name': keep_lb.name,
                        'protocol': keep_lb.protocol,
                        'algorithm': keep_lb.algorithm,
                        'port': keep_lb.port,
                        'nodes': []
                    }]
                }
                for node in keep_lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    node_data = {
                        'id': node.id,
                        'port': node.port,
                        'address': node.address,
                        'weight': node.weight,
                        'condition': condition
                    }
                    job_data['loadBalancers'][0]['nodes'].append(node_data)
            else:
                # This is a delete
                dev = session.query(Device.name).\
                    filter(Device.id == data).first()
                vip = session.query(Vip).\
                    filter(Vip.device == data).first()
                if vip:
                    submit_vip_job('REMOVE', dev.name,
                                   str(ipaddress.IPv4Address(vip.ip)))
                job_data = {"hpcs_action": "DELETE"}

            status, response = self._send_message(job_data, 'hpcs_response')
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            if not status:
                LOG.error("Failed Gearman delete for LB {0}".format(lb.id))
                self._set_error(data, response, session)
            lb.status = 'DELETED'
            tenant_id = lb.tenantid

            if count == 0:
                # Device should never be used again
                device = session.query(Device).\
                    filter(Device.id == data).first()
                device.status = 'DELETED'
            # Remove LB-device join
            session.execute(loadbalancers_devices.delete().where(
                loadbalancers_devices.c.loadbalancer == lb.id))
            session.query(Node).\
                filter(Node.lbid == lb.id).delete()
            session.query(HealthMonitor).\
                filter(HealthMonitor.lbid == lb.id).delete()
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_deleted').first()
            counter.value += 1
            session.commit()

            # Notify billing of the LB deletion
            update_mnb('lbaas.instance.delete', self.lbid, tenant_id)

    def _set_error(self, device_id, errmsg, session):
        lbs = session.query(
            LoadBalancer
        ).join(LoadBalancer.nodes).\
            join(LoadBalancer.devices).\
            filter(Device.id == device_id).\
            filter(LoadBalancer.status != 'DELETED').\
            all()
        device = session.query(Device).\
            filter(Device.id == device_id).\
            first()
        if device is None:
            # Device already deleted, probably a race between the OFFLINE check
            # and auto-failover
            return
        device.status = 'ERROR'
        counter = session.query(Counters).\
            filter(Counters.name == 'loadbalancers_error').first()
        for lb in lbs:
            counter.value += 1
            lb.status = 'ERROR'
            lb.errmsg = errmsg

    def send_archive(self, data):
        with db_session() as session:
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            job_data = {
                'hpcs_action': 'ARCHIVE',
                'hpcs_object_store_basepath': data['objectStoreBasePath'],
                'hpcs_object_store_endpoint': data['objectStoreEndpoint'],
                'hpcs_object_store_token': data['authToken'],
                'hpcs_object_store_type': data['objectStoreType'],
                'loadBalancers': [{
                    'id': str(lb.id),
                    'name': lb.name,
                    'protocol': lb.protocol
                }]
            }
            status, response = self._send_message(job_data, 'hpcs_response')
            device = session.query(Device).\
                filter(Device.id == data['deviceid']).\
                first()
            if status:
                device.errmsg = 'Log archive successful'
            else:
                device.errmsg = 'Log archive failed: {0}'.format(response)
            lb.status = 'ACTIVE'
            counter = session.query(Counters).\
                filter(Counters.name == 'log_archives').first()
            counter.value += 1
            session.commit()

    def send_update(self, data):
        with db_session() as session:
            lbs = session.query(
                LoadBalancer
            ).join(LoadBalancer.nodes).\
                join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.status != 'DELETED').\
                all()
            job_data = {'hpcs_action': 'UPDATE', 'loadBalancers': []}

            degraded = []
            if lbs is None:
                LOG.error(
                    'Attempting to send empty LB data for device {0} ({1}), '
                    'something went wrong'.format(data, self.host))
                self._set_error(data, "LB config error", session)
                session.commit()
                return

            for lb in lbs:
                lb_data = {
                    'name': lb.name,
                    'protocol': lb.protocol,
                    'algorithm': lb.algorithm,
                    'port': lb.port,
                    'nodes': [],
                    'monitor': {}
                }
                for node in lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    backup = 'FALSE'
                    if node.backup != 0:
                        backup = 'TRUE'
                    node_data = {
                        'id': node.id,
                        'port': node.port,
                        'address': node.address,
                        'weight': node.weight,
                        'condition': condition,
                        'backup': backup
                    }

                    lb_data['nodes'].append(node_data)
                    # Track if we have a DEGRADED LB
                    if node.status == 'ERROR':
                        degraded.append(lb.id)

                # Add a default health monitor if one does not exist
                monitor = session.query(HealthMonitor).\
                    filter(HealthMonitor.lbid == lb.id).first()

                if monitor is None:
                    # Set it to a default configuration
                    monitor = HealthMonitor(lbid=lb.id,
                                            type="CONNECT",
                                            delay=30,
                                            timeout=30,
                                            attempts=2,
                                            path=None)
                    session.add(monitor)
                    session.flush()

                monitor_data = {
                    'type': monitor.type,
                    'delay': monitor.delay,
                    'timeout': monitor.timeout,
                    'attempts': monitor.attempts
                }
                if monitor.path is not None:
                    monitor_data['path'] = monitor.path

                # All new LBs created since these options were supported
                # will have default values in the DB. Pre-existing LBs will
                # not have any values, so we need to check for that.
                if any([lb.timeout, lb.retries]):
                    lb_data['options'] = {
                        'client_timeout': lb.timeout,
                        'server_timeout': lb.timeout,
                        'connect_timeout': lb.timeout,
                        'connect_retries': lb.retries
                    }

                lb_data['monitor'] = monitor_data
                job_data['loadBalancers'].append(lb_data)

            # Update the worker
            mnb_data = {}
            status, response = self._send_message(job_data, 'hpcs_response')
            if not status:
                self._set_error(data, response, session)
            else:
                for lb in lbs:
                    if lb.id in degraded:
                        lb.status = 'DEGRADED'
                        lb.errmsg = "A node on the load balancer has failed"
                    elif lb.status == 'ERROR':
                        # Do nothing because something else failed in the mean
                        # time
                        pass
                    elif lb.status == 'BUILD':
                        # Do nothing if a new device, stay in BUILD state until
                        # floating IP assign finishes
                        if len(lbs) > 1:
                            lb.status = 'ACTIVE'
                            if lb.id == self.lbid:
                                # This is the new LB being added to a device.
                                # We don't have to assign a vip so we can
                                # notify billing of the LB creation (once the
                                # DB is updated)
                                mnb_data["lbid"] = lb.id
                                mnb_data["tenantid"] = lb.tenantid
                    else:
                        lb.status = 'ACTIVE'
                        lb.errmsg = None
            device = session.query(Device).\
                filter(Device.id == data).\
                first()
            if device is None:
                # Shouldn't hit here, but just to be safe
                session.commit()
                return
            if device.status == 'BUILD' and len(lbs) > 1:
                device.status = 'ONLINE'
            device_name = device.name
            device_status = device.status
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_updated').first()
            counter.value += 1
            session.commit()
            if device_status == 'BUILD':
                submit_vip_job('ASSIGN', device_name, None)

            # Send the MnB create if needed
            if "lbid" in mnb_data:
                update_mnb('lbaas.instance.create', mnb_data["lbid"],
                           mnb_data["tenantid"])

    def _send_message(self, message, response_name):
        job_status = self.gearman_client.submit_job(self.host,
                                                    message,
                                                    background=False,
                                                    wait_until_complete=True,
                                                    max_retries=10,
                                                    poll_timeout=120.0)
        if job_status.state == 'UNKNOWN':
            # Gearman server connection failed
            LOG.error('Could not talk to gearman server')
            return False, "System error communicating with load balancer"
        if job_status.timed_out:
            # Job timed out
            LOG.warning('Gearman timeout talking to {0}'.format(self.host))
            return False, "Timeout error communicating with load balancer"
        LOG.debug(job_status.result)
        if 'badRequest' in job_status.result:
            error = job_status.result['badRequest']['validationErrors']
            return False, error['message']
        if job_status.result[response_name] == 'FAIL':
            # Worker says 'no'
            if 'hpcs_error' in job_status.result:
                error = job_status.result['hpcs_error']
            else:
                error = 'Load Balancer error'
            LOG.error('Gearman error response from {0}: {1}'.format(
                self.host, error))
            return False, error
        LOG.info('Gearman success from {0}'.format(self.host))
        return True, job_status.result
Example #12
class GearJobs(object):
    def __init__(self):
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gm_client = JSONGearmanClient(server_list)

    def send_pings(self, node_list):
        # TODO: lots of duplicated code that needs cleanup
        list_of_jobs = []
        failed_list = []
        node_status = dict()
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        job_data = {"hpcs_action": "STATS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_pings = self.gm_client.submit_multiple_jobs(
            list_of_jobs,
            background=False,
            wait_until_complete=True,
            poll_timeout=self.poll_timeout)
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                LOG.error('Gearman Job server fail')
                continue
            if ping.timed_out:
                # Ping timeout
                retry_list.append(ping.job.task)
                continue
            if ping.result['hpcs_response'] == 'FAIL':
                if ('status' in ping.result
                        and ping.result['status'] == 'DELETED'):
                    continue
                # Error returned by Gearman
                failed_list.append(ping.job.task)
                continue
            else:
                if 'nodes' in ping.result:
                    node_status[ping.job.task] = ping.result['nodes']

        list_of_jobs = []
        if len(retry_list) > 0:
            LOG.info("{0} pings timed out, retrying".format(len(retry_list)))
            for node in retry_list:
                list_of_jobs.append(dict(task=str(node), data=job_data))
            submitted_pings = self.gm_client.submit_multiple_jobs(
                list_of_jobs,
                background=False,
                wait_until_complete=True,
                poll_timeout=self.poll_retry)
            for ping in submitted_pings:
                if ping.state == JOB_UNKNOWN:
                    # TODO: Gearman server failed, ignoring for now
                    LOG.error('Gearman Job server fail')
                    continue
                if ping.timed_out:
                    # Ping timeout
                    failed_list.append(ping.job.task)
                    continue
                if ping.result['hpcs_response'] == 'FAIL':
                    if ('status' in ping.result
                            and ping.result['status'] == 'DELETED'):
                        continue
                    # Error returned by Gearman
                    failed_list.append(ping.job.task)
                    continue
                else:
                    if 'nodes' in ping.result:
                        node_status[ping.job.task] = ping.result['nodes']

        return failed_list, node_status

    def offline_check(self, node_list):
        list_of_jobs = []
        failed_list = []
        job_data = {"hpcs_action": "DIAGNOSTICS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_pings = self.gm_client.submit_multiple_jobs(
            list_of_jobs,
            background=False,
            wait_until_complete=True,
            poll_timeout=self.poll_timeout)
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task))
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                gearman_count = 0
                gearman_fail = 0
                for gearman_test in ping.result['gearman']:
                    gearman_count += 1
                    if gearman_test['status'] == 'FAIL':
                        gearman_fail += 1
                # Need at least 2/3 of the gearman servers up
                max_fail_count = gearman_count / 3
                if gearman_fail > max_fail_count:
                    failed_list.append(ping.job.task)
        return failed_list

    def get_stats(self, node_list):
        # TODO: lots of duplicated code that needs cleanup
        list_of_jobs = []
        failed_list = []
        retry_list = []
        results = {}
        job_data = {"hpcs_action": "METRICS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_stats = self.gm_client.submit_multiple_jobs(
            list_of_jobs,
            background=False,
            wait_until_complete=True,
            poll_timeout=self.poll_timeout)
        for stats in submitted_stats:
            if stats.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                retry_list.append(stats.job.task)
            elif stats.timed_out:
                # Timeout
                retry_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                # Success
                results[stats.job.task] = stats.result

        list_of_jobs = []
        if len(retry_list) > 0:
            LOG.info("{0} Statistics gathering timed out, retrying".format(
                len(retry_list)))
            for node in retry_list:
                list_of_jobs.append(dict(task=str(node), data=job_data))
            submitted_stats = self.gm_client.submit_multiple_jobs(
                list_of_jobs,
                background=False,
                wait_until_complete=True,
                poll_timeout=self.poll_retry)
            for stats in submitted_stats:
                if stats.state == JOB_UNKNOWN:
                    # TODO: Gearman server failed, ignoring for now
                    LOG.error("Gearman Job server failed gathering statistics "
                              "on {0}".format(stats.job.task))
                    failed_list.append(stats.job.task)
                elif stats.timed_out:
                    # Timeout
                    failed_list.append(stats.job.task)
                elif stats.result['hpcs_response'] == 'FAIL':
                    # Error returned by Gearman
                    failed_list.append(stats.job.task)
                else:
                    # Success
                    results[stats.job.task] = stats.result

        return failed_list, results
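
A hedged sketch of one way the duplicated submit-and-classify loops flagged
by the TODOs above could be consolidated. The helper name _submit_jobs and
its return shape are illustrative assumptions, not part of the original
code; per-action quirks such as send_pings' DELETED check would stay with
the callers:

    def _submit_jobs(self, node_list, job_data, poll_timeout):
        """Submit one job per node and sort the outcomes into buckets."""
        jobs = [dict(task=str(node), data=job_data) for node in node_list]
        submitted = self.gm_client.submit_multiple_jobs(
            jobs, background=False, wait_until_complete=True,
            poll_timeout=poll_timeout)
        failed, timed_out, results = [], [], {}
        for job in submitted:
            if job.state == JOB_UNKNOWN:
                # Gearman server failed; callers decide how to react
                LOG.error('Gearman Job server fail')
            elif job.timed_out:
                timed_out.append(job.job.task)
            elif job.result['hpcs_response'] == 'FAIL':
                failed.append(job.job.task)
            else:
                results[job.job.task] = job.result
        return failed, timed_out, results

send_pings() and get_stats() could then do a first pass with
self.poll_timeout, re-submit only the timed_out bucket with
self.poll_retry, and fold anything that times out again into failed_list.
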
Example #13
class GearJobs(object):
    def __init__(self):
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': cfg.CONF['gearman']['ssl_key'],
                                'certfile': cfg.CONF['gearman']['ssl_cert'],
                                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                                'keepalive': cfg.CONF['gearman']['keepalive'],
                                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                                'keepidle': cfg.CONF['gearman']['keepidle'],
                                'keepintvl': cfg.CONF['gearman']['keepintvl']
                                })
        self.gm_client = JSONGearmanClient(server_list)

    def send_pings(self, node_list):
        # TODO: lots of duplicated code that needs cleanup
        list_of_jobs = []
        failed_list = []
        node_status = dict()
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        job_data = {"hpcs_action": "STATS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_pings = self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=self.poll_timeout
        )
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                LOG.error('Gearman Job server fail')
                continue
            if ping.timed_out:
                # Ping timeout
                retry_list.append(ping.job.task)
                continue
            if ping.result['hpcs_response'] == 'FAIL':
                if (
                    'status' in ping.result and
                    ping.result['status'] == 'DELETED'
                ):
                    continue
                # Error returned by Gearman
                failed_list.append(ping.job.task)
                continue
            else:
                if 'nodes' in ping.result:
                    node_status[ping.job.task] = ping.result['nodes']

        list_of_jobs = []
        if len(retry_list) > 0:
            LOG.info(
                "{0} pings timed out, retrying".format(len(retry_list))
            )
            for node in retry_list:
                list_of_jobs.append(dict(task=str(node), data=job_data))
            submitted_pings = self.gm_client.submit_multiple_jobs(
                list_of_jobs, background=False, wait_until_complete=True,
                poll_timeout=self.poll_retry
            )
            for ping in submitted_pings:
                if ping.state == JOB_UNKNOWN:
                    # TODO: Gearman server failed, ignoring for now
                    LOG.error('Gearman Job server fail')
                    continue
                if ping.timed_out:
                    # Ping timeout
                    failed_list.append(ping.job.task)
                    continue
                if ping.result['hpcs_response'] == 'FAIL':
                    if (
                        'status' in ping.result and
                        ping.result['status'] == 'DELETED'
                    ):
                        continue
                    # Error returned by Gearman
                    failed_list.append(ping.job.task)
                    continue
                else:
                    if 'nodes' in ping.result:
                        node_status[ping.job.task] = ping.result['nodes']

        return failed_list, node_status

    def offline_check(self, node_list):
        list_of_jobs = []
        failed_list = []
        job_data = {"hpcs_action": "DIAGNOSTICS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_pings = self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=self.poll_timeout
        )
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task)
                )
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                gearman_count = 0
                gearman_fail = 0
                for gearman_test in ping.result['gearman']:
                    gearman_count += 1
                    if gearman_test['status'] == 'FAIL':
                        gearman_fail += 1
                # Need 2/3rds gearman up
                max_fail_count = gearman_count / 3
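                # e.g. with three gearman servers, two or more failed
                # checks exceed the threshold and the node is marked bad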
                if gearman_fail > max_fail_count:
                    failed_list.append(ping.job.task)
        return failed_list

    def get_stats(self, node_list):
        # TODO: lots of duplicated code that needs cleanup
        list_of_jobs = []
        failed_list = []
        retry_list = []
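        # jobs that hit JOB_UNKNOWN or time out on the first pass
        # get one retry below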
        results = {}
        job_data = {"hpcs_action": "METRICS"}
        for node in node_list:
            list_of_jobs.append(dict(task=str(node), data=job_data))
        submitted_stats = self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=self.poll_timeout
        )
        for stats in submitted_stats:
            if stats.state == JOB_UNKNOWN:
                # TODO: Gearman server failed, ignoring for now
                retry_list.append(stats.job.task)
            elif stats.timed_out:
                # Timeout
                retry_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                # Success
                results[stats.job.task] = stats.result

        list_of_jobs = []
        if len(retry_list) > 0:
            LOG.info(
                "Statistics gathering timed out for {0} nodes, "
                "retrying".format(len(retry_list))
            )
            for node in retry_list:
                list_of_jobs.append(dict(task=str(node), data=job_data))
            submitted_stats = self.gm_client.submit_multiple_jobs(
                list_of_jobs, background=False, wait_until_complete=True,
                poll_timeout=self.poll_retry
            )
            for stats in submitted_stats:
                if stats.state == JOB_UNKNOWN:
                    # TODO: Gearman server failed, ignoring for now
                    LOG.error(
                        "Gearman Job server failed gathering statistics "
                        "on {0}".format(stats.job.task)
                    )
                    failed_list.append(stats.job.task)
                elif stats.timed_out:
                    # Timeout
                    failed_list.append(stats.job.task)
                elif stats.result['hpcs_response'] == 'FAIL':
                    # Error returned by Gearman
                    failed_list.append(stats.job.task)
                else:
                    # Success
                    results[stats.job.task] = stats.result

        return failed_list, results
Example #14
    def __init__(self, logger, args):
        self.logger = logger
        self.gm_client = JSONGearmanClient(args.server)
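        # (args.server is assumed to already hold the gearman endpoint
        # list that JSONGearmanClient accepts)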
Example #15
class GearmanWork(object):
    def __init__(self):
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message,
            background=False,
            wait_until_complete=True,
            max_retries=10,
            poll_timeout=30.0)
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB')
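            # note: deliberately no 'continue' here; even when the worker
            # reports FAIL, the device row is removed from the DB below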

            delete_count += 1
            with db_session() as session:
                session.query(Device).filter(
                    Device.name == status.result['name']).delete()
                session.commit()

        LOG.info('%d freed devices deleted from pool', delete_count)

    def send_vips_message(self, message):
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message,
            background=False,
            wait_until_complete=True,
            max_retries=10,
            poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            built_count += 1
            try:
                self._add_vip(status.result)
            except Exception:
                LOG.exception('Could not add vip to DB, node data: {0}'.format(
                    status.result))
        LOG.info(
            '{vips} vips built and added to pool'.format(vips=built_count))

    def send_create_message(self, message):
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending {0} gearman messages".format(len(message)))
        job_status = self.gearman_client.submit_multiple_jobs(
            message,
            background=False,
            wait_until_complete=True,
            max_retries=10,
            poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            built_count += 1
            try:
                self._add_node(status.result)
            except Exception:
                LOG.exception(
                    'Could not add node to DB, node data: {0}'.format(
                        status.result))
        LOG.info('{nodes} devices built and added to pool'.format(
            nodes=built_count))

    def _add_vip(self, data):
        LOG.info('Adding vip {0} to DB'.format(data['ip']))
        vip = Vip()
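        # store the VIP as its packed 32-bit integer; unicode() satisfies
        # the ipaddress module's text requirement on Python 2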
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _add_node(self, data):
        LOG.info('Adding device {0} to DB'.format(data['name']))
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = 'OFFLINE'
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()

    def _add_bad_node(self, data):
        LOG.info('Adding bad device {0} to DB to be deleted'.format(
            data['name']))
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = 'DELETED'
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()
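
For context, a hedged usage sketch of GearmanWork. The task name and
payload below are illustrative assumptions, not taken from the original
code; submit_multiple_jobs() simply consumes one dict per job, where
'task' names the worker queue and 'data' is the JSON-serialisable payload:

    work = GearmanWork()
    messages = [dict(task='a_worker_queue',          # hypothetical queue
                     data={'action': 'BUILD_DEVICE'})  # hypothetical payload
                for _ in range(5)]
    work.send_create_message(messages)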