class GearJobs(object):
    """Submits STATS gearman pings to worker nodes and classifies replies.

    send_pings() returns the nodes that look dead; send_repair() returns
    the nodes that have come back to life.
    """

    def __init__(self, logger, args):
        self.logger = logger
        self.gm_client = JSONGearmanClient(args.server)

    def _submit_stats_jobs(self, node_list):
        """Send a STATS job to every node and wait for all replies.

        Returns the list of completed job request objects.
        """
        job_data = {"hpcs_action": "STATS"}
        list_of_jobs = [dict(task=str(node), data=job_data)
                        for node in node_list]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=5.0
        )

    def send_pings(self, node_list):
        """Ping each node; return the tasks that timed out or replied FAIL."""
        failed_list = []
        for ping in self._submit_stats_jobs(node_list):
            if ping.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
            elif ping.timed_out:
                # Ping timeout counts as a failed node
                failed_list.append(ping.job.task)
            elif ping.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman worker
                failed_list.append(ping.job.task)
        return failed_list

    def send_repair(self, node_list):
        """Ping each node; return the tasks that answered successfully."""
        repaired_list = []
        for ping in self._submit_stats_jobs(node_list):
            if ping.state == 'UNKNOWN':
                # TODO: Gearman server failed, ignoring for now
                self.logger.error('Gearman Job server fail')
            elif ping.timed_out:
                pass  # still not answering
            elif ping.result['hpcs_response'] == 'FAIL':
                pass  # still reporting an error
            else:
                repaired_list.append(ping.job.task)
        return repaired_list
def _test_node(self, name):
    """ Run diags on node, blow it away if bad

    Sends a DIAGNOSTICS gearman job to the device and returns True when
    the device is considered healthy, False when it should be destroyed.
    """
    # Build a fresh SSL gearman client for this one check.
    server_list = []
    for server in cfg.CONF['gearman']['servers']:
        host, port = server.split(':')
        server_list.append({'host': host,
                            'port': int(port),
                            'keyfile': cfg.CONF['gearman']['ssl_key'],
                            'certfile': cfg.CONF['gearman']['ssl_cert'],
                            'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                            'keepalive': cfg.CONF['gearman']['keepalive'],
                            'keepcnt': cfg.CONF['gearman']['keepcnt'],
                            'keepidle': cfg.CONF['gearman']['keepidle'],
                            'keepintvl': cfg.CONF['gearman']['keepintvl']})
    gm_client = JSONGearmanClient(server_list)
    job_data = {'hpcs_action': 'DIAGNOSTICS'}
    job_status = gm_client.submit_job(
        str(name), job_data, background=False, wait_until_complete=True,
        max_retries=10, poll_timeout=10
    )
    if job_status.state == JOB_UNKNOWN:
        # Gearman server connect fail, count as bad node because we can't
        # tell if it really is working
        LOG.error('Could not talk to gearman server')
        return False
    if job_status.timed_out:
        # No reply in time: treat the device as bad.
        LOG.warning('Timeout getting diags from {0}'.format(name))
        return False
    LOG.debug(job_status.result)
    # Would only happen if DIAGNOSTICS call not supported
    # (an old worker) -- give the device the benefit of the doubt.
    if job_status.result['hpcs_response'] == 'FAIL':
        return True
    # Device reports its own network connectivity is broken.
    if job_status.result['network'] == 'FAIL':
        return False
    gearman_count = 0
    gearman_fail = 0
    for gearman_test in job_status.result['gearman']:
        gearman_count += 1
        if gearman_test['status'] == 'FAIL':
            LOG.info(
                'Device {0} cannot talk to gearman {1}'
                .format(name, gearman_test['host'])
            )
            gearman_fail += 1
    # Need 2/3rds gearman up
    # NOTE(review): integer division here (Python 2), so the threshold
    # rounds down -- presumably intentional; confirm before porting.
    max_fail_count = gearman_count / 3
    if gearman_fail > max_fail_count:
        return False
    return True
def __init__(self):
    """Connect a JSON gearman client to every configured job server."""
    gm_opts = cfg.CONF['gearman']
    # SSL / keepalive settings are identical for every endpoint.
    ssl_opts = {
        'keyfile': gm_opts['ssl_key'],
        'certfile': gm_opts['ssl_cert'],
        'ca_certs': gm_opts['ssl_ca'],
        'keepalive': gm_opts['keepalive'],
        'keepcnt': gm_opts['keepcnt'],
        'keepidle': gm_opts['keepidle'],
        'keepintvl': gm_opts['keepintvl']
    }
    server_list = []
    for endpoint in gm_opts['servers']:
        addr, portnum = endpoint.split(':')
        entry = {'host': addr, 'port': int(portnum)}
        entry.update(ssl_opts)
        server_list.append(entry)
    self.gearman_client = JSONGearmanClient(server_list)
def __init__(self, host, lbid):
    """Remember the target worker and LB id, then connect to gearman."""
    self.host = host
    self.lbid = lbid
    # Shared SSL / keepalive options for every gearman endpoint.
    common = {
        'keyfile': conf.gearman.ssl_key,
        'certfile': conf.gearman.ssl_cert,
        'ca_certs': conf.gearman.ssl_ca,
        'keepalive': conf.gearman.keepalive,
        'keepcnt': conf.gearman.keepcnt,
        'keepidle': conf.gearman.keepidle,
        'keepintvl': conf.gearman.keepintvl
    }
    server_list = []
    for endpoint in conf.gearman.server:
        addr, portnum = endpoint.split(':')
        server_list.append(dict(common, host=addr, port=int(portnum)))
    self.gearman_client = JSONGearmanClient(server_list)
def __init__(self):
    """Load stats polling timeouts and connect to the gearman servers."""
    admin_opts = cfg.CONF['admin_api']
    self.poll_timeout = admin_opts['stats_poll_timeout']
    self.poll_retry = admin_opts['stats_poll_timeout_retry']
    gm_opts = cfg.CONF['gearman']
    server_list = []
    for endpoint in gm_opts['servers']:
        addr, portnum = endpoint.split(':')
        server_list.append({
            'host': addr,
            'port': int(portnum),
            'keyfile': gm_opts['ssl_key'],
            'certfile': gm_opts['ssl_cert'],
            'ca_certs': gm_opts['ssl_ca'],
            'keepalive': gm_opts['keepalive'],
            'keepcnt': gm_opts['keepcnt'],
            'keepidle': gm_opts['keepidle'],
            'keepintvl': gm_opts['keepintvl']
        })
    self.gm_client = JSONGearmanClient(server_list)
def __init__(self):
    """Establish the SSL gearman client used by this object."""
    gearman_cfg = cfg.CONF['gearman']

    def _endpoint(spec):
        # 'host:port' -> connection dict carrying the shared SSL options.
        node, portnum = spec.split(':')
        return {'host': node,
                'port': int(portnum),
                'keyfile': gearman_cfg['ssl_key'],
                'certfile': gearman_cfg['ssl_cert'],
                'ca_certs': gearman_cfg['ssl_ca'],
                'keepalive': gearman_cfg['keepalive'],
                'keepcnt': gearman_cfg['keepcnt'],
                'keepidle': gearman_cfg['keepidle'],
                'keepintvl': gearman_cfg['keepintvl']}

    self.gearman_client = JSONGearmanClient(
        [_endpoint(spec) for spec in gearman_cfg['servers']])
def __init__(self, host, lbid):
    """Store the worker host and load balancer id; build a gearman client."""
    self.host = host
    self.lbid = lbid

    def _endpoint(spec):
        # Turn 'host:port' into a full SSL connection description.
        node, portnum = spec.split(':')
        return {'host': node,
                'port': int(portnum),
                'keyfile': conf.gearman.ssl_key,
                'certfile': conf.gearman.ssl_cert,
                'ca_certs': conf.gearman.ssl_ca,
                'keepalive': conf.gearman.keepalive,
                'keepcnt': conf.gearman.keepcnt,
                'keepidle': conf.gearman.keepidle,
                'keepintvl': conf.gearman.keepintvl}

    self.gearman_client = JSONGearmanClient(
        [_endpoint(spec) for spec in conf.gearman.server])
def __init__(self):
    """Read stats polling timeouts and open the gearman client connection."""
    self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
    self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']
    gm_opts = cfg.CONF['gearman']
    # Options common to every server entry.
    shared = {
        'keyfile': gm_opts['ssl_key'],
        'certfile': gm_opts['ssl_cert'],
        'ca_certs': gm_opts['ssl_ca'],
        'keepalive': gm_opts['keepalive'],
        'keepcnt': gm_opts['keepcnt'],
        'keepidle': gm_opts['keepidle'],
        'keepintvl': gm_opts['keepintvl']
    }
    server_list = []
    for spec in gm_opts['servers']:
        node, portnum = spec.split(':')
        server_list.append(dict(shared, host=node, port=int(portnum)))
    self.gm_client = JSONGearmanClient(server_list)
class GearmanClientThread(object):
    """Per-device gearman client used by the admin/API side.

    Each instance targets one worker (``host``) and optionally one load
    balancer (``lbid``); the send_* methods push a single job to that
    worker and update the database with the outcome.
    """

    def __init__(self, host, lbid):
        self.host = host
        self.lbid = lbid
        server_list = []
        for server in conf.gearman.server:
            ghost, gport = server.split(':')
            server_list.append({'host': ghost,
                                'port': int(gport),
                                'keyfile': conf.gearman.ssl_key,
                                'certfile': conf.gearman.ssl_cert,
                                'ca_certs': conf.gearman.ssl_ca,
                                'keepalive': conf.gearman.keepalive,
                                'keepcnt': conf.gearman.keepcnt,
                                'keepidle': conf.gearman.keepidle,
                                'keepintvl': conf.gearman.keepintvl})
        self.gearman_client = JSONGearmanClient(server_list)

    def send_assign(self, data):
        """Assign a floating IP to device ``data`` (a device name).

        When ``self.lbid`` is unset a free vip is picked (row-locked);
        otherwise the existing vip ``self.lbid`` is reused.  Returns True
        on success, False on any failure.
        """
        NULL = None  # For pep8
        with db_session() as session:
            device = session.query(Device).\
                filter(Device.name == data).first()
            if device is None:
                LOG.error(
                    "VIP assign have been given non existent device {0}"
                    .format(data)
                )
                session.rollback()
                return False
            if not self.lbid:
                # Grab any unassigned vip, locked so two assigns cannot
                # pick the same row.
                vip = session.query(Vip).\
                    filter(Vip.device == NULL).\
                    with_lockmode('update').\
                    first()
                if vip is None:
                    errmsg = 'Floating IP assign failed (none available)'
                    LOG.error(
                        "Failed to assign IP to device {0} (none available)"
                        .format(data)
                    )
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            else:
                vip = session.query(Vip).\
                    filter(Vip.id == self.lbid).first()
                if vip is None:
                    errmsg = 'Cannot find existing floating IP'
                    LOG.error(
                        "Failed to assign IP to device {0}"
                        .format(data)
                    )
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            vip.device = device.id
            vip_id = vip.id
            vip_ip = vip.ip
            session.commit()
        ip_str = str(ipaddress.IPv4Address(vip_ip))
        job_data = {
            'action': 'ASSIGN_IP',
            'name': data,
            'ip': ip_str
        }
        status, response = self._send_message(job_data, 'response')
        if status:
            return True
        elif self.lbid:
            # Existing vip: leave the row alone so a retry can reuse it.
            LOG.error(
                "Failed to assign IP {0} to device {1}"
                .format(ip_str, data)
            )
        else:
            LOG.error(
                "Failed to assign IP {0} to device {1}"
                .format(ip_str, data)
            )
            # set to device 0 to make sure it won't be used again
            with db_session() as session:
                vip = session.query(Vip).filter(Vip.id == vip_id).first()
                vip.device = 0
                session.commit()
            submit_vip_job('REMOVE', None, ip_str)
        return False

    def send_remove(self, data=None):
        """Delete the floating IP ``self.lbid`` from its device.

        Retries the worker job up to 5 times; on success the vip row is
        removed, on failure it is parked on device 0 for later cleanup.
        """
        job_data = {
            'action': 'DELETE_IP',
            'ip': self.lbid
        }
        ip_int = int(ipaddress.IPv4Address(unicode(self.lbid)))
        for x in xrange(0, 5):
            LOG.info(
                'Attempt to delete IP {0} #{1}'
                .format(self.lbid, x)
            )
            status, response = self._send_message(job_data, 'response')
            if status:
                break
        with db_session() as session:
            if not status:
                LOG.error(
                    "Failed to delete IP {0}"
                    .format(self.lbid)
                )
                # Set to 0 to mark as something that needs cleaning up
                # but cannot be used again
                vip = session.query(Vip).\
                    filter(Vip.ip == ip_int).first()
                vip.device = 0
            else:
                session.query(Vip).\
                    filter(Vip.ip == ip_int).delete()
                counter = session.query(Counters).\
                    filter(Counters.name == 'vips_deleted').first()
                counter.value += 1
            session.commit()

    def send_delete(self, data):
        """Delete LB ``self.lbid`` from device ``data`` (a device id).

        If other live LBs share the device, an UPDATE job rebuilding the
        surviving LB is sent instead of DELETE; otherwise the device is
        torn down and its vip released.
        """
        with db_session() as session:
            # Count other live LBs sharing this device.
            count = session.query(
                LoadBalancer
            ).join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.id != self.lbid).\
                filter(LoadBalancer.status != 'DELETED').\
                filter(LoadBalancer.status != 'PENDING_DELETE').\
                count()
            if count >= 1:
                # This is an update message because we want to retain the
                # remaining LB
                keep_lb = session.query(LoadBalancer).\
                    join(LoadBalancer.nodes).\
                    join(LoadBalancer.devices).\
                    filter(Device.id == data).\
                    filter(LoadBalancer.id != self.lbid).\
                    filter(LoadBalancer.status != 'DELETED').\
                    filter(LoadBalancer.status != 'PENDING_DELETE').\
                    first()
                job_data = {
                    'hpcs_action': 'UPDATE',
                    'loadBalancers': [{
                        'name': keep_lb.name,
                        'protocol': keep_lb.protocol,
                        'algorithm': keep_lb.algorithm,
                        'port': keep_lb.port,
                        'nodes': []
                    }]
                }
                for node in keep_lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    node_data = {
                        'id': node.id,
                        'port': node.port,
                        'address': node.address,
                        'weight': node.weight,
                        'condition': condition
                    }
                    job_data['loadBalancers'][0]['nodes'].append(node_data)
            else:
                # This is a delete
                dev = session.query(Device.name).\
                    filter(Device.id == data).first()
                vip = session.query(Vip).\
                    filter(Vip.device == data).first()
                if vip:
                    submit_vip_job(
                        'REMOVE', dev.name,
                        str(ipaddress.IPv4Address(vip.ip))
                    )
                job_data = {"hpcs_action": "DELETE"}
            status, response = self._send_message(job_data, 'hpcs_response')
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            if not status:
                LOG.error(
                    "Failed Gearman delete for LB {0}".format(lb.id)
                )
                self._set_error(data, response, session)
            # LB is marked DELETED even when the worker call failed.
            lb.status = 'DELETED'
            tenant_id = lb.tenantid
            if count == 0:
                # Device should never be used again
                device = session.query(Device).\
                    filter(Device.id == data).first()
                device.status = 'DELETED'
            # Remove LB-device join
            session.execute(loadbalancers_devices.delete().where(
                loadbalancers_devices.c.loadbalancer == lb.id
            ))
            session.query(Node).\
                filter(Node.lbid == lb.id).delete()
            session.query(HealthMonitor).\
                filter(HealthMonitor.lbid == lb.id).delete()
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_deleted').first()
            counter.value += 1
            session.commit()
        # Notify billing of the LB deletion
        update_mnb('lbaas.instance.delete', self.lbid, tenant_id)

    def _set_error(self, device_id, errmsg, session):
        """Mark a device and all its live LBs as ERROR with ``errmsg``.

        Uses the caller's ``session``; the caller commits.
        """
        lbs = session.query(
            LoadBalancer
        ).join(LoadBalancer.nodes).\
            join(LoadBalancer.devices).\
            filter(Device.id == device_id).\
            filter(LoadBalancer.status != 'DELETED').\
            all()
        device = session.query(Device).\
            filter(Device.id == device_id).\
            first()
        if device is None:
            # Device already deleted, probably a race between the OFFLINE
            # check and auto-failover
            return
        device.status = 'ERROR'
        counter = session.query(Counters).\
            filter(Counters.name == 'loadbalancers_error').first()
        for lb in lbs:
            counter.value += 1
            lb.status = 'ERROR'
            lb.errmsg = errmsg

    def send_archive(self, data):
        """Ask the worker to archive LB logs to object storage.

        ``data`` carries the object-store endpoint/token details plus
        the device id; the outcome is stored on the device row.
        """
        with db_session() as session:
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            job_data = {
                'hpcs_action': 'ARCHIVE',
                'hpcs_object_store_basepath': data['objectStoreBasePath'],
                'hpcs_object_store_endpoint': data['objectStoreEndpoint'],
                'hpcs_object_store_token': data['authToken'],
                'hpcs_object_store_type': data['objectStoreType'],
                'loadBalancers': [{
                    'id': str(lb.id),
                    'name': lb.name,
                    'protocol': lb.protocol
                }]
            }
            status, response = self._send_message(job_data, 'hpcs_response')
            device = session.query(Device).\
                filter(Device.id == data['deviceid']).\
                first()
            if status:
                device.errmsg = 'Log archive successful'
            else:
                device.errmsg = 'Log archive failed: {0}'.format(response)
            lb.status = 'ACTIVE'
            counter = session.query(Counters).\
                filter(Counters.name == 'log_archives').first()
            counter.value += 1
            session.commit()

    def send_update(self, data):
        """Push the full config of every live LB on device ``data``.

        Rebuilds the complete UPDATE payload (nodes, health monitor,
        timeout/retry options) and reconciles LB/device status from the
        result, kicking off vip assignment and billing notification when
        a new device finishes building.
        """
        with db_session() as session:
            lbs = session.query(
                LoadBalancer
            ).join(LoadBalancer.nodes).\
                join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.status != 'DELETED').\
                all()
            job_data = {
                'hpcs_action': 'UPDATE',
                'loadBalancers': []
            }
            degraded = []
            if lbs is None:
                LOG.error(
                    'Attempting to send empty LB data for device {0} ({1}), '
                    'something went wrong'.format(data, self.host)
                )
                self._set_error(data, "LB config error", session)
                session.commit()
                return
            for lb in lbs:
                lb_data = {
                    'name': lb.name,
                    'protocol': lb.protocol,
                    'algorithm': lb.algorithm,
                    'port': lb.port,
                    'nodes': [],
                    'monitor': {}
                }
                for node in lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    backup = 'FALSE'
                    if node.backup != 0:
                        backup = 'TRUE'
                    node_data = {
                        'id': node.id,
                        'port': node.port,
                        'address': node.address,
                        'weight': node.weight,
                        'condition': condition,
                        'backup': backup
                    }
                    lb_data['nodes'].append(node_data)
                    # Track if we have a DEGRADED LB
                    if node.status == 'ERROR':
                        degraded.append(lb.id)
                # Add a default health monitor if one does not exist
                monitor = session.query(HealthMonitor).\
                    filter(HealthMonitor.lbid == lb.id).first()
                if monitor is None:
                    # Set it to a default configuration
                    monitor = HealthMonitor(
                        lbid=lb.id, type="CONNECT", delay=30,
                        timeout=30, attempts=2, path=None
                    )
                    session.add(monitor)
                    session.flush()
                monitor_data = {
                    'type': monitor.type,
                    'delay': monitor.delay,
                    'timeout': monitor.timeout,
                    'attempts': monitor.attempts
                }
                if monitor.path is not None:
                    monitor_data['path'] = monitor.path
                # All new LBs created since these options were supported
                # will have default values in the DB. Pre-existing LBs will
                # not have any values, so we need to check for that.
                if any([lb.timeout, lb.retries]):
                    lb_data['options'] = {
                        'client_timeout': lb.timeout,
                        'server_timeout': lb.timeout,
                        'connect_timeout': lb.timeout,
                        'connect_retries': lb.retries
                    }
                lb_data['monitor'] = monitor_data
                job_data['loadBalancers'].append(lb_data)
            # Update the worker
            mnb_data = {}
            status, response = self._send_message(job_data, 'hpcs_response')
            if not status:
                self._set_error(data, response, session)
            else:
                for lb in lbs:
                    if lb.id in degraded:
                        lb.status = 'DEGRADED'
                        lb.errmsg = "A node on the load balancer has failed"
                    elif lb.status == 'ERROR':
                        # Do nothing because something else failed in the
                        # mean time
                        pass
                    elif lb.status == 'BUILD':
                        # Do nothing if a new device, stay in BUILD state
                        # until floating IP assign finishes
                        if len(lbs) > 1:
                            lb.status = 'ACTIVE'
                            if lb.id == self.lbid:
                                # This is the new LB being added to a device.
                                # We don't have to assign a vip so we can
                                # notify billing of the LB creation (once the
                                # DB is updated)
                                mnb_data["lbid"] = lb.id
                                mnb_data["tenantid"] = lb.tenantid
                    else:
                        lb.status = 'ACTIVE'
                        lb.errmsg = None
            device = session.query(Device).\
                filter(Device.id == data).\
                first()
            if device is None:
                # Shouldn't hit here, but just to be safe
                session.commit()
                return
            if device.status == 'BUILD' and len(lbs) > 1:
                device.status = 'ONLINE'
            # Capture values used after the session closes.
            device_name = device.name
            device_status = device.status
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_updated').first()
            counter.value += 1
            session.commit()
        if device_status == 'BUILD':
            submit_vip_job(
                'ASSIGN', device_name, None
            )
        # Send the MnB create if needed
        if "lbid" in mnb_data:
            update_mnb('lbaas.instance.create',
                       mnb_data["lbid"],
                       mnb_data["tenantid"])

    def _send_message(self, message, response_name):
        """Submit ``message`` to this thread's worker and wait for it.

        Returns ``(True, result)`` on success or ``(False, errmsg)`` on
        connection failure, timeout, validation error or a FAIL value in
        ``result[response_name]``.
        """
        job_status = self.gearman_client.submit_job(
            self.host, message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=120.0
        )
        if job_status.state == 'UNKNOWN':
            # Gearman server connection failed
            LOG.error('Could not talk to gearman server')
            return False, "System error communicating with load balancer"
        if job_status.timed_out:
            # Job timed out
            LOG.warning(
                'Gearman timeout talking to {0}'.format(self.host)
            )
            return False, "Timeout error communicating with load balancer"
        LOG.debug(job_status.result)
        if 'badRequest' in job_status.result:
            error = job_status.result['badRequest']['validationErrors']
            return False, error['message']
        if job_status.result[response_name] == 'FAIL':
            # Worker says 'no'
            if 'hpcs_error' in job_status.result:
                error = job_status.result['hpcs_error']
            else:
                error = 'Load Balancer error'
            LOG.error(
                'Gearman error response from {0}: {1}'.format(
                    self.host, error
                )
            )
            return False, error
        LOG.info('Gearman success from {0}'.format(self.host))
        return True, job_status.result
class GearmanWork(object):
    """Sends pool-manager build/delete jobs over gearman and records the
    results in the database."""

    def __init__(self):
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({'host': host,
                                'port': int(port),
                                'keyfile': cfg.CONF['gearman']['ssl_key'],
                                'certfile': cfg.CONF['gearman']['ssl_cert'],
                                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                                'keepalive': cfg.CONF['gearman']['keepalive'],
                                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                                'keepidle': cfg.CONF['gearman']['keepidle'],
                                'keepintvl': cfg.CONF['gearman']['keepintvl']
                                })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        """Send delete-device jobs; the device rows are removed from the
        DB even when the worker reports failure (the failure is logged)."""
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=30.0
        )
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB'
                )
            delete_count += 1
            with db_session() as session:
                session.query(Device).\
                    filter(Device.name == status.result['name']).delete()
                session.commit()
        LOG.info('%d freed devices delete from pool', delete_count)

    def send_vips_message(self, message):
        """Build floating IPs via gearman and add successes to the DB."""
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue
            built_count += 1
            try:
                self._add_vip(status.result)
            except Exception:
                # Narrowed from a bare "except:" so SystemExit and
                # KeyboardInterrupt are not swallowed; DB insert errors
                # are logged and the loop keeps going.
                LOG.exception(
                    'Could not add vip to DB, node data: {0}'
                    .format(status.result)
                )
        LOG.info(
            '{vips} vips built and added to pool'.format(vips=built_count)
        )

    def send_create_message(self, message):
        """Build devices via gearman; record good nodes, mark bad ones."""
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending {0} gearman messages".format(len(message)))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0
        )
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue
            built_count += 1
            try:
                self._add_node(status.result)
            except Exception:
                # Narrowed from a bare "except:" -- only real errors are
                # caught and logged; control-flow exceptions propagate.
                LOG.exception(
                    'Could not add node to DB, node data: {0}'
                    .format(status.result)
                )
        LOG.info(
            '{nodes} devices built and added to pool'.format(nodes=built_count)
        )

    def _add_vip(self, data):
        """Insert a newly built vip row (the IP is stored as an integer)."""
        LOG.info('Adding vip {0} to DB'.format(data['ip']))
        vip = Vip()
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _add_node(self, data):
        """Insert a successfully built device row with status OFFLINE."""
        LOG.info('Adding device {0} to DB'.format(data['name']))
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = 'OFFLINE'
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()

    def _add_bad_node(self, data):
        """Insert a failed build with status DELETED so it gets cleaned up."""
        LOG.info(
            'Adding bad device {0} to DB to be deleted'.format(data['name'])
        )
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = 'DELETED'
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()
class GearmanClientThread(object):
    """Client for driving a single load balancer device over gearman.

    ``host`` names the worker; ``lbid`` is the load balancer (or, for
    send_remove, the floating IP) the operation applies to.  Each send_*
    method submits one job and persists the outcome to the database.
    """

    def __init__(self, host, lbid):
        self.host = host
        self.lbid = lbid
        server_list = []
        for server in conf.gearman.server:
            ghost, gport = server.split(':')
            server_list.append({
                'host': ghost,
                'port': int(gport),
                'keyfile': conf.gearman.ssl_key,
                'certfile': conf.gearman.ssl_cert,
                'ca_certs': conf.gearman.ssl_ca,
                'keepalive': conf.gearman.keepalive,
                'keepcnt': conf.gearman.keepcnt,
                'keepidle': conf.gearman.keepidle,
                'keepintvl': conf.gearman.keepintvl
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_assign(self, data):
        """Assign a floating IP to the device named ``data``.

        Picks a free vip (row-locked) when ``self.lbid`` is unset, else
        reuses vip ``self.lbid``.  Returns True on success.
        """
        NULL = None  # For pep8
        with db_session() as session:
            device = session.query(Device).\
                filter(Device.name == data).first()
            if device is None:
                LOG.error("VIP assign have been given non existent device {0}".
                          format(data))
                session.rollback()
                return False
            if not self.lbid:
                # Lock the chosen free vip so concurrent assigns cannot
                # race for the same row.
                vip = session.query(Vip).\
                    filter(Vip.device == NULL).\
                    with_lockmode('update').\
                    first()
                if vip is None:
                    errmsg = 'Floating IP assign failed (none available)'
                    LOG.error(
                        "Failed to assign IP to device {0} (none available)".
                        format(data))
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            else:
                vip = session.query(Vip).\
                    filter(Vip.id == self.lbid).first()
                if vip is None:
                    errmsg = 'Cannot find existing floating IP'
                    LOG.error("Failed to assign IP to device {0}".format(data))
                    self._set_error(device.id, errmsg, session)
                    session.commit()
                    return False
            vip.device = device.id
            vip_id = vip.id
            vip_ip = vip.ip
            session.commit()
        ip_str = str(ipaddress.IPv4Address(vip_ip))
        job_data = {'action': 'ASSIGN_IP', 'name': data, 'ip': ip_str}
        status, response = self._send_message(job_data, 'response')
        if status:
            return True
        elif self.lbid:
            # Pre-existing vip: leave the DB row intact for a retry.
            LOG.error("Failed to assign IP {0} to device {1}".format(
                ip_str, data))
        else:
            LOG.error("Failed to assign IP {0} to device {1}".format(
                ip_str, data))
            # set to device 0 to make sure it won't be used again
            with db_session() as session:
                vip = session.query(Vip).filter(Vip.id == vip_id).first()
                vip.device = 0
                session.commit()
            submit_vip_job('REMOVE', None, ip_str)
        return False

    def send_remove(self, data=None):
        """Delete the floating IP ``self.lbid``, retrying up to 5 times.

        On success the vip row is deleted; on failure it is parked on
        device 0 so it is never handed out again.
        """
        job_data = {'action': 'DELETE_IP', 'ip': self.lbid}
        ip_int = int(ipaddress.IPv4Address(unicode(self.lbid)))
        for x in xrange(0, 5):
            LOG.info('Attempt to delete IP {0} #{1}'.format(self.lbid, x))
            status, response = self._send_message(job_data, 'response')
            if status:
                break
        with db_session() as session:
            if not status:
                LOG.error("Failed to delete IP {0}".format(self.lbid))
                # Set to 0 to mark as something that needs cleaning up
                # but cannot be used again
                vip = session.query(Vip).\
                    filter(Vip.ip == ip_int).first()
                vip.device = 0
            else:
                session.query(Vip).\
                    filter(Vip.ip == ip_int).delete()
                counter = session.query(Counters).\
                    filter(Counters.name == 'vips_deleted').first()
                counter.value += 1
            session.commit()

    def send_delete(self, data):
        """Remove LB ``self.lbid`` from device ``data`` (device id).

        Sends an UPDATE keeping the surviving LB when the device is
        shared, otherwise a DELETE plus vip release and device teardown.
        """
        with db_session() as session:
            # How many other live LBs share this device?
            count = session.query(
                LoadBalancer
            ).join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.id != self.lbid).\
                filter(LoadBalancer.status != 'DELETED').\
                filter(LoadBalancer.status != 'PENDING_DELETE').\
                count()
            if count >= 1:
                # This is an update message because we want to retain the
                # remaining LB
                keep_lb = session.query(LoadBalancer).\
                    join(LoadBalancer.nodes).\
                    join(LoadBalancer.devices).\
                    filter(Device.id == data).\
                    filter(LoadBalancer.id != self.lbid).\
                    filter(LoadBalancer.status != 'DELETED').\
                    filter(LoadBalancer.status != 'PENDING_DELETE').\
                    first()
                job_data = {
                    'hpcs_action': 'UPDATE',
                    'loadBalancers': [{
                        'name': keep_lb.name,
                        'protocol': keep_lb.protocol,
                        'algorithm': keep_lb.algorithm,
                        'port': keep_lb.port,
                        'nodes': []
                    }]
                }
                for node in keep_lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    node_data = {
                        'id': node.id, 'port': node.port,
                        'address': node.address, 'weight': node.weight,
                        'condition': condition
                    }
                    job_data['loadBalancers'][0]['nodes'].append(node_data)
            else:
                # This is a delete
                dev = session.query(Device.name).\
                    filter(Device.id == data).first()
                vip = session.query(Vip).\
                    filter(Vip.device == data).first()
                if vip:
                    submit_vip_job('REMOVE', dev.name,
                                   str(ipaddress.IPv4Address(vip.ip)))
                job_data = {"hpcs_action": "DELETE"}
            status, response = self._send_message(job_data, 'hpcs_response')
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            if not status:
                LOG.error("Failed Gearman delete for LB {0}".format(lb.id))
                self._set_error(data, response, session)
            # The LB is marked DELETED even if the worker call failed.
            lb.status = 'DELETED'
            tenant_id = lb.tenantid
            if count == 0:
                # Device should never be used again
                device = session.query(Device).\
                    filter(Device.id == data).first()
                device.status = 'DELETED'
            # Remove LB-device join
            session.execute(loadbalancers_devices.delete().where(
                loadbalancers_devices.c.loadbalancer == lb.id))
            session.query(Node).\
                filter(Node.lbid == lb.id).delete()
            session.query(HealthMonitor).\
                filter(HealthMonitor.lbid == lb.id).delete()
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_deleted').first()
            counter.value += 1
            session.commit()
        # Notify billing of the LB deletion
        update_mnb('lbaas.instance.delete', self.lbid, tenant_id)

    def _set_error(self, device_id, errmsg, session):
        """Flag a device and all its live LBs as ERROR (caller commits)."""
        lbs = session.query(
            LoadBalancer
        ).join(LoadBalancer.nodes).\
            join(LoadBalancer.devices).\
            filter(Device.id == device_id).\
            filter(LoadBalancer.status != 'DELETED').\
            all()
        device = session.query(Device).\
            filter(Device.id == device_id).\
            first()
        if device is None:
            # Device already deleted, probably a race between the OFFLINE
            # check and auto-failover
            return
        device.status = 'ERROR'
        counter = session.query(Counters).\
            filter(Counters.name == 'loadbalancers_error').first()
        for lb in lbs:
            counter.value += 1
            lb.status = 'ERROR'
            lb.errmsg = errmsg

    def send_archive(self, data):
        """Request log archiving to object storage for LB ``self.lbid``.

        The result (success or failure message) is stored on the device
        row identified by ``data['deviceid']``.
        """
        with db_session() as session:
            lb = session.query(LoadBalancer).\
                filter(LoadBalancer.id == self.lbid).\
                first()
            job_data = {
                'hpcs_action': 'ARCHIVE',
                'hpcs_object_store_basepath': data['objectStoreBasePath'],
                'hpcs_object_store_endpoint': data['objectStoreEndpoint'],
                'hpcs_object_store_token': data['authToken'],
                'hpcs_object_store_type': data['objectStoreType'],
                'loadBalancers': [{
                    'id': str(lb.id),
                    'name': lb.name,
                    'protocol': lb.protocol
                }]
            }
            status, response = self._send_message(job_data, 'hpcs_response')
            device = session.query(Device).\
                filter(Device.id == data['deviceid']).\
                first()
            if status:
                device.errmsg = 'Log archive successful'
            else:
                device.errmsg = 'Log archive failed: {0}'.format(response)
            lb.status = 'ACTIVE'
            counter = session.query(Counters).\
                filter(Counters.name == 'log_archives').first()
            counter.value += 1
            session.commit()

    def send_update(self, data):
        """Push the full configuration of every live LB on device ``data``.

        Builds the complete UPDATE payload (nodes, monitor, options) and
        reconciles LB/device statuses from the worker's reply, starting
        vip assignment and billing notification for new devices.
        """
        with db_session() as session:
            lbs = session.query(
                LoadBalancer
            ).join(LoadBalancer.nodes).\
                join(LoadBalancer.devices).\
                filter(Device.id == data).\
                filter(LoadBalancer.status != 'DELETED').\
                all()
            job_data = {'hpcs_action': 'UPDATE', 'loadBalancers': []}
            degraded = []
            if lbs is None:
                LOG.error(
                    'Attempting to send empty LB data for device {0} ({1}), '
                    'something went wrong'.format(data, self.host))
                self._set_error(data, "LB config error", session)
                session.commit()
                return
            for lb in lbs:
                lb_data = {
                    'name': lb.name,
                    'protocol': lb.protocol,
                    'algorithm': lb.algorithm,
                    'port': lb.port,
                    'nodes': [],
                    'monitor': {}
                }
                for node in lb.nodes:
                    if not node.enabled:
                        continue
                    condition = 'ENABLED'
                    backup = 'FALSE'
                    if node.backup != 0:
                        backup = 'TRUE'
                    node_data = {
                        'id': node.id,
                        'port': node.port,
                        'address': node.address,
                        'weight': node.weight,
                        'condition': condition,
                        'backup': backup
                    }
                    lb_data['nodes'].append(node_data)
                    # Track if we have a DEGRADED LB
                    if node.status == 'ERROR':
                        degraded.append(lb.id)
                # Add a default health monitor if one does not exist
                monitor = session.query(HealthMonitor).\
                    filter(HealthMonitor.lbid == lb.id).first()
                if monitor is None:
                    # Set it to a default configuration
                    monitor = HealthMonitor(lbid=lb.id, type="CONNECT",
                                            delay=30, timeout=30,
                                            attempts=2, path=None)
                    session.add(monitor)
                    session.flush()
                monitor_data = {
                    'type': monitor.type,
                    'delay': monitor.delay,
                    'timeout': monitor.timeout,
                    'attempts': monitor.attempts
                }
                if monitor.path is not None:
                    monitor_data['path'] = monitor.path
                # All new LBs created since these options were supported
                # will have default values in the DB. Pre-existing LBs will
                # not have any values, so we need to check for that.
                if any([lb.timeout, lb.retries]):
                    lb_data['options'] = {
                        'client_timeout': lb.timeout,
                        'server_timeout': lb.timeout,
                        'connect_timeout': lb.timeout,
                        'connect_retries': lb.retries
                    }
                lb_data['monitor'] = monitor_data
                job_data['loadBalancers'].append(lb_data)
            # Update the worker
            mnb_data = {}
            status, response = self._send_message(job_data, 'hpcs_response')
            if not status:
                self._set_error(data, response, session)
            else:
                for lb in lbs:
                    if lb.id in degraded:
                        lb.status = 'DEGRADED'
                        lb.errmsg = "A node on the load balancer has failed"
                    elif lb.status == 'ERROR':
                        # Do nothing because something else failed in the
                        # mean time
                        pass
                    elif lb.status == 'BUILD':
                        # Do nothing if a new device, stay in BUILD state
                        # until floating IP assign finishes
                        if len(lbs) > 1:
                            lb.status = 'ACTIVE'
                            if lb.id == self.lbid:
                                # This is the new LB being added to a device.
                                # We don't have to assign a vip so we can
                                # notify billing of the LB creation (once the
                                # DB is updated)
                                mnb_data["lbid"] = lb.id
                                mnb_data["tenantid"] = lb.tenantid
                    else:
                        lb.status = 'ACTIVE'
                        lb.errmsg = None
            device = session.query(Device).\
                filter(Device.id == data).\
                first()
            if device is None:
                # Shouldn't hit here, but just to be safe
                session.commit()
                return
            if device.status == 'BUILD' and len(lbs) > 1:
                device.status = 'ONLINE'
            # Copy values used once the session has been closed.
            device_name = device.name
            device_status = device.status
            counter = session.query(Counters).\
                filter(Counters.name == 'loadbalancers_updated').first()
            counter.value += 1
            session.commit()
        if device_status == 'BUILD':
            submit_vip_job('ASSIGN', device_name, None)
        # Send the MnB create if needed
        if "lbid" in mnb_data:
            update_mnb('lbaas.instance.create',
                       mnb_data["lbid"],
                       mnb_data["tenantid"])

    def _send_message(self, message, response_name):
        """Submit a job to ``self.host`` and wait up to 120s for a reply.

        Returns ``(True, result)`` on success, otherwise ``(False, msg)``
        for connection failure, timeout, validation errors or a FAIL in
        ``result[response_name]``.
        """
        job_status = self.gearman_client.submit_job(self.host, message,
                                                    background=False,
                                                    wait_until_complete=True,
                                                    max_retries=10,
                                                    poll_timeout=120.0)
        if job_status.state == 'UNKNOWN':
            # Gearman server connection failed
            LOG.error('Could not talk to gearman server')
            return False, "System error communicating with load balancer"
        if job_status.timed_out:
            # Job timed out
            LOG.warning('Gearman timeout talking to {0}'.format(self.host))
            return False, "Timeout error communicating with load balancer"
        LOG.debug(job_status.result)
        if 'badRequest' in job_status.result:
            error = job_status.result['badRequest']['validationErrors']
            return False, error['message']
        if job_status.result[response_name] == 'FAIL':
            # Worker says 'no'
            if 'hpcs_error' in job_status.result:
                error = job_status.result['hpcs_error']
            else:
                error = 'Load Balancer error'
            LOG.error('Gearman error response from {0}: {1}'.format(
                self.host, error))
            return False, error
        LOG.info('Gearman success from {0}'.format(self.host))
        return True, job_status.result
class GearJobs(object):
    """Sends health-check, diagnostic and metrics jobs to LB devices.

    Each device registers a Gearman worker under its own name, so a
    "job" here is a synchronous message to one device: we block up to
    the configured poll timeout waiting for every device to answer.
    """

    def __init__(self):
        # Primary and retry poll timeouts come from the admin_api config.
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        # One connection entry per configured gearman server, all
        # sharing the same SSL and TCP keepalive settings.
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gm_client = JSONGearmanClient(server_list)

    def _submit_jobs(self, node_list, job_data, timeout):
        """Send job_data to every node and wait for all the replies."""
        list_of_jobs = [dict(task=str(node), data=job_data)
                        for node in node_list]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=timeout)

    def _sort_ping(self, ping, timeout_sink, failed_list, node_status):
        """Classify one STATS reply (shared by both ping passes).

        Timeouts go to timeout_sink (the retry list on the first pass,
        the failed list on the retry pass); hard failures go to
        failed_list unless the device reports itself DELETED; healthy
        replies record their per-node status in node_status.
        """
        if ping.state == JOB_UNKNOWN:
            # TODO: Gearman server failed, ignoring for now
            LOG.error('Gearman Job server fail')
            return
        if ping.timed_out:
            # Ping timeout
            timeout_sink.append(ping.job.task)
            return
        if ping.result['hpcs_response'] == 'FAIL':
            if ('status' in ping.result and
                    ping.result['status'] == 'DELETED'):
                # Deleted on purpose -- not a failure.
                return
            # Error returned by Gearman
            failed_list.append(ping.job.task)
            return
        if 'nodes' in ping.result:
            node_status[ping.job.task] = ping.result['nodes']

    def send_pings(self, node_list):
        """Ping every device; return (failed_devices, node_status).

        Devices that time out on the first pass get one retry with the
        retry timeout before being declared failed.
        """
        failed_list = []
        node_status = dict()
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        job_data = {"hpcs_action": "STATS"}

        submitted_pings = self._submit_jobs(node_list, job_data,
                                            self.poll_timeout)
        for ping in submitted_pings:
            self._sort_ping(ping, retry_list, failed_list, node_status)

        if len(retry_list) > 0:
            LOG.info("{0} pings timed out, retrying".format(len(retry_list)))
            submitted_pings = self._submit_jobs(retry_list, job_data,
                                                self.poll_retry)
            for ping in submitted_pings:
                # A second timeout now counts as a failure.
                self._sort_ping(ping, failed_list, failed_list, node_status)

        return failed_list, node_status

    def offline_check(self, node_list):
        """Run DIAGNOSTICS on devices; return the list that failed."""
        failed_list = []
        job_data = {"hpcs_action": "DIAGNOSTICS"}
        submitted_pings = self._submit_jobs(node_list, job_data,
                                            self.poll_timeout)
        for ping in submitted_pings:
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task))
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                gearman_count = 0
                gearman_fail = 0
                for gearman_test in ping.result['gearman']:
                    gearman_count += 1
                    if gearman_test['status'] == 'FAIL':
                        gearman_fail += 1
                # Need 2/3rds gearman up.  '//' keeps the original
                # Python 2 integer-division result and is safe on py3.
                max_fail_count = gearman_count // 3
                if gearman_fail > max_fail_count:
                    failed_list.append(ping.job.task)

        return failed_list

    def get_stats(self, node_list):
        """Gather METRICS from devices; return (failed_devices, results)."""
        failed_list = []
        retry_list = []
        results = {}
        job_data = {"hpcs_action": "METRICS"}

        submitted_stats = self._submit_jobs(node_list, job_data,
                                            self.poll_timeout)
        for stats in submitted_stats:
            if stats.state == JOB_UNKNOWN:
                # Gearman server failed; retry below.
                retry_list.append(stats.job.task)
            elif stats.timed_out:
                # Timeout
                retry_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                # Success
                results[stats.job.task] = stats.result

        if len(retry_list) > 0:
            LOG.info("{0} Statistics gathering timed out, retrying".format(
                len(retry_list)))
            submitted_stats = self._submit_jobs(retry_list, job_data,
                                                self.poll_retry)
            for stats in submitted_stats:
                if stats.state == JOB_UNKNOWN:
                    LOG.error("Gearman Job server failed gathering statistics "
                              "on {0}".format(stats.job.task))
                    failed_list.append(stats.job.task)
                elif stats.timed_out:
                    # Timeout
                    failed_list.append(stats.job.task)
                elif stats.result['hpcs_response'] == 'FAIL':
                    # Error returned by Gearman
                    failed_list.append(stats.job.task)
                else:
                    # Success
                    results[stats.job.task] = stats.result

        return failed_list, results
class GearJobs(object):
    """Submits STATS / DIAGNOSTICS / METRICS jobs to LB devices.

    A Gearman "task" here is the device's own worker name; calls are
    synchronous and bounded by the configured poll timeouts.
    """

    def __init__(self):
        # Timeouts for the first pass and the retry pass.
        self.poll_timeout = cfg.CONF['admin_api']['stats_poll_timeout']
        self.poll_retry = cfg.CONF['admin_api']['stats_poll_timeout_retry']

        # Every configured gearman server shares the same SSL material
        # and TCP keepalive tuning.
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gm_client = JSONGearmanClient(server_list)

    def _submit_jobs(self, node_list, job_data, timeout):
        """Send job_data to every node and block for all replies."""
        list_of_jobs = [dict(task=str(node), data=job_data)
                        for node in node_list]
        return self.gm_client.submit_multiple_jobs(
            list_of_jobs, background=False, wait_until_complete=True,
            poll_timeout=timeout)

    def _sort_ping(self, ping, timeout_sink, failed_list, node_status):
        """Classify one STATS reply (used by both ping passes).

        Timed-out pings land in timeout_sink (retry list first pass,
        failed list second pass).  FAIL replies land in failed_list
        unless the device reports its own status as DELETED.  Good
        replies record per-node status.
        """
        if ping.state == JOB_UNKNOWN:
            # TODO: Gearman server failed, ignoring for now
            LOG.error('Gearman Job server fail')
            return
        if ping.timed_out:
            # Ping timeout
            timeout_sink.append(ping.job.task)
            return
        if ping.result['hpcs_response'] == 'FAIL':
            if ('status' in ping.result and
                    ping.result['status'] == 'DELETED'):
                # Intentionally deleted device -- skip.
                return
            # Error returned by Gearman
            failed_list.append(ping.job.task)
            return
        if 'nodes' in ping.result:
            node_status[ping.job.task] = ping.result['nodes']

    def send_pings(self, node_list):
        """Ping all devices; return (failed_devices, node_status).

        First-pass timeouts are retried once with the retry timeout.
        """
        failed_list = []
        node_status = dict()
        retry_list = []
        # The message name is STATS for historical reasons. Real
        # data statistics are gathered with METRICS messages.
        job_data = {"hpcs_action": "STATS"}

        for ping in self._submit_jobs(node_list, job_data,
                                      self.poll_timeout):
            self._sort_ping(ping, retry_list, failed_list, node_status)

        if len(retry_list) > 0:
            LOG.info("{0} pings timed out, retrying".format(len(retry_list)))
            for ping in self._submit_jobs(retry_list, job_data,
                                          self.poll_retry):
                # Timing out twice means the device has failed.
                self._sort_ping(ping, failed_list, failed_list, node_status)

        return failed_list, node_status

    def offline_check(self, node_list):
        """DIAGNOSTICS pass over devices; return those that failed."""
        failed_list = []
        job_data = {"hpcs_action": "DIAGNOSTICS"}
        for ping in self._submit_jobs(node_list, job_data,
                                      self.poll_timeout):
            if ping.state == JOB_UNKNOWN:
                LOG.error(
                    "Gearman Job server failed during OFFLINE check of {0}".
                    format(ping.job.task))
            elif ping.timed_out:
                failed_list.append(ping.job.task)
            elif ping.result['network'] == 'FAIL':
                failed_list.append(ping.job.task)
            else:
                gearman_count = 0
                gearman_fail = 0
                for gearman_test in ping.result['gearman']:
                    gearman_count += 1
                    if gearman_test['status'] == 'FAIL':
                        gearman_fail += 1
                # Need 2/3rds gearman up.  '//' preserves the Python 2
                # integer division and stays correct under Python 3.
                max_fail_count = gearman_count // 3
                if gearman_fail > max_fail_count:
                    failed_list.append(ping.job.task)

        return failed_list

    def get_stats(self, node_list):
        """Collect METRICS from devices; return (failed_devices, results)."""
        failed_list = []
        retry_list = []
        results = {}
        job_data = {"hpcs_action": "METRICS"}

        for stats in self._submit_jobs(node_list, job_data,
                                       self.poll_timeout):
            if stats.state == JOB_UNKNOWN:
                # Gearman server failed; retry below.
                retry_list.append(stats.job.task)
            elif stats.timed_out:
                # Timeout
                retry_list.append(stats.job.task)
            elif stats.result['hpcs_response'] == 'FAIL':
                # Error returned by Gearman
                failed_list.append(stats.job.task)
            else:
                # Success
                results[stats.job.task] = stats.result

        if len(retry_list) > 0:
            LOG.info("{0} Statistics gathering timed out, retrying".format(
                len(retry_list)))
            for stats in self._submit_jobs(retry_list, job_data,
                                           self.poll_retry):
                if stats.state == JOB_UNKNOWN:
                    LOG.error("Gearman Job server failed gathering statistics "
                              "on {0}".format(stats.job.task))
                    failed_list.append(stats.job.task)
                elif stats.timed_out:
                    # Timeout
                    failed_list.append(stats.job.task)
                elif stats.result['hpcs_response'] == 'FAIL':
                    # Error returned by Gearman
                    failed_list.append(stats.job.task)
                else:
                    # Success
                    results[stats.job.task] = stats.result

        return failed_list, results
def __init__(self, logger, args):
    """Keep the caller-supplied logger and open a JSON Gearman client
    against the server(s) named in args.server."""
    self.logger, self.gm_client = logger, JSONGearmanClient(args.server)
class GearmanWork(object):
    """Dispatches build/delete jobs to pool-manager Gearman workers
    and records the outcome in the database.
    """

    def __init__(self):
        # One connection entry per configured gearman server, all with
        # the same SSL material and TCP keepalive tuning.
        server_list = []
        for server in cfg.CONF['gearman']['servers']:
            host, port = server.split(':')
            server_list.append({
                'host': host,
                'port': int(port),
                'keyfile': cfg.CONF['gearman']['ssl_key'],
                'certfile': cfg.CONF['gearman']['ssl_cert'],
                'ca_certs': cfg.CONF['gearman']['ssl_ca'],
                'keepalive': cfg.CONF['gearman']['keepalive'],
                'keepcnt': cfg.CONF['gearman']['keepcnt'],
                'keepidle': cfg.CONF['gearman']['keepidle'],
                'keepintvl': cfg.CONF['gearman']['keepintvl']
            })
        self.gearman_client = JSONGearmanClient(server_list)

    def send_delete_message(self, message):
        """Send delete jobs and purge the devices from the DB.

        The DB row is removed even when the worker reports FAIL (the
        device is gone from the pool either way); only server-level
        problems (unknown state, timeout) leave the row in place.
        """
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=30.0)
        delete_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst deleting device')
                continue
            if status.result['response'] == 'FAIL':
                # Deliberate fall-through: still remove the row below.
                LOG.error(
                    'Pool manager failed to delete a device, removing from DB')

            delete_count += 1
            with db_session() as session:
                session.query(Device).\
                    filter(Device.name == status.result['name']).delete()
                session.commit()

        LOG.info('%d freed devices delete from pool', delete_count)

    def send_vips_message(self, message):
        """Build floating IPs via the workers and add each to the DB."""
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending %d gearman messages", len(message))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building vip')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a vip')
                continue

            built_count += 1
            try:
                self._add_vip(status.result)
            except Exception:
                # Was a bare 'except:'; narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                LOG.exception('Could not add vip to DB, node data: {0}'.format(
                    status.result))
        LOG.info(
            '{vips} vips built and added to pool'.format(vips=built_count))

    def send_create_message(self, message):
        """Build new devices via the workers and record results in the DB.

        Failed builds that still produced a device name are stored as
        DELETED so the cleanup pass can remove them later.
        """
        # TODO: make this gearman part more async, not wait for all builds
        LOG.info("Sending {0} gearman messages".format(len(message)))
        job_status = self.gearman_client.submit_multiple_jobs(
            message, background=False, wait_until_complete=True,
            max_retries=10, poll_timeout=3600.0)
        built_count = 0
        for status in job_status:
            if status.state == JOB_UNKNOWN:
                LOG.error('Gearman Job server fail')
                continue
            if status.timed_out:
                LOG.error('Gearman timeout whilst building device')
                continue
            if status.result['response'] == 'FAIL':
                LOG.error('Pool manager failed to build a device')
                if 'name' in status.result:
                    self._add_bad_node(status.result)
                continue

            built_count += 1
            try:
                self._add_node(status.result)
            except Exception:
                # Was a bare 'except:'; narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                LOG.exception(
                    'Could not add node to DB, node data: {0}'.format(
                        status.result))
        LOG.info('{nodes} devices built and added to pool'.format(
            nodes=built_count))

    def _add_vip(self, data):
        """Store a newly built floating IP (as a packed integer)."""
        LOG.info('Adding vip {0} to DB'.format(data['ip']))
        vip = Vip()
        vip.ip = int(ipaddress.IPv4Address(unicode(data['ip'])))
        with db_session() as session:
            session.add(vip)
            session.commit()

    def _store_device(self, data, status):
        """Create and commit a Device row from worker response data.

        Shared by _add_node / _add_bad_node, which differ only in the
        status they record.
        """
        device = Device()
        device.name = data['name']
        device.publicIpAddr = data['addr']
        # TODO: kill this field, make things use publicIpAddr instead
        device.floatingIpAddr = data['addr']
        device.az = data['az']
        device.type = data['type']
        device.pingCount = 0
        device.status = status
        device.created = None
        with db_session() as session:
            session.add(device)
            session.commit()

    def _add_node(self, data):
        """Record a successfully built device as OFFLINE (ready for use)."""
        LOG.info('Adding device {0} to DB'.format(data['name']))
        self._store_device(data, 'OFFLINE')

    def _add_bad_node(self, data):
        """Record a failed build as DELETED so it gets cleaned up."""
        LOG.info('Adding bad device {0} to DB to be deleted'.format(
            data['name']))
        self._store_device(data, 'DELETED')