def PUT(self, node_id):
    node = self.get_object_or_404(Node, node_id)
    if not node.attributes:
        node.attributes = NodeAttributes(node_id=node.id)
    data = self.validator.validate_update(web.data())
    for key, value in data.iteritems():
        setattr(node, key, value)
        if key == 'cluster_id':
            # The new cluster_id *value* (not the key, which is always
            # truthy here) decides whether the node joined or left a cluster.
            if value:
                self.allow_network_assignment_to_all_interfaces(node)
                self.assign_networks_to_main_interface(node)
            else:
                self.clear_assigned_networks(node)
                self.clear_all_allowed_networks(node)
    # Regenerate volumes only when role or cluster changed and the node is
    # not mid-provisioning/deployment (parentheses make the intended
    # precedence explicit).
    if node.status not in ('provisioning', 'deploying') \
            and ("role" in data or "cluster_id" in data):
        try:
            node.attributes.volumes = \
                node.volume_manager.gen_volumes_info()
        except Exception as exc:
            msg = (
                u"Failed to generate volumes "
                "info for node '{0}': '{1}'"
            ).format(
                node.name or data.get("mac") or data.get("id"),
                str(exc) or "see logs for details"
            )
            logger.warning(traceback.format_exc())
            notifier.notify("error", msg, node_id=node.id)
    self.db.commit()
    return self.render(node)
def run(self):
    super(FakeDeletionThread, self).run()
    receiver = NailgunReceiver
    receiver.initialize()
    kwargs = {
        'task_uuid': self.task_uuid,
        'nodes': self.data['args']['nodes'],
        'status': 'ready'
    }
    nodes_to_restore = self.data['args'].get('nodes_to_restore', [])
    tick_interval = int(settings.FAKE_TASKS_TICK_INTERVAL) or 3
    resp_method = getattr(receiver, self.respond_to)
    resp_method(**kwargs)
    orm = scoped_session(
        sessionmaker(bind=engine, query_cls=NoCacheQuery)
    )
    for node in nodes_to_restore:
        # An offline node was just deleted from the db
        # and cannot be recreated with status 'discover'
        if not node.online:
            continue
        node.status = 'discover'
        orm.add(node)
        orm.commit()
        ram = round(node.meta.get('ram') or 0, 1)
        cores = node.meta.get('cores') or 'unknown'
        notifier.notify(
            "discover",
            "New node with %s CPU core(s) "
            "and %s GB memory is discovered" % (cores, ram),
            node_id=node.id
        )
    receiver.stop()
def run(self):
    while not self.stoprequest.isSet():
        self.db.expire_all()
        for node_db in self.db.query(Node).filter(
            # nodes may become unresponsive while provisioning
            not_(Node.status == 'provisioning')
        ):
            timedelta = (datetime.now() - node_db.timestamp).seconds
            if timedelta > self.timeout:
                logger.warning(
                    u"Node '{0}' seems to be offline "
                    "for {1} seconds...".format(
                        node_db.name,
                        timedelta
                    )
                )
                if node_db.online:
                    node_db.online = False
                    self.db.add(node_db)
                    self.db.commit()
                    notifier.notify(
                        "error",
                        u"Node '{0}' has gone away".format(
                            node_db.name or node_db.mac
                        ),
                        node_id=node_db.id
                    )
        self.sleep()
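# Aside (illustrative, not from the original module): timedelta.seconds
# carries only the seconds *component* (0..86399) and wraps after one day,
# so very long outages would be under-reported by the check above. A
# minimal wrap-safe sketch, assuming only the standard library:
from datetime import datetime

def seconds_since(timestamp, now=None):
    """Full elapsed seconds, including whole days (illustrative helper)."""
    return ((now or datetime.now()) - timestamp).total_seconds()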
def POST(self):
    data = self.checked_data()
    node = Node()
    for key, value in data.iteritems():
        if key == "meta":
            node.create_meta(value)
        else:
            setattr(node, key, value)
    node.name = "Untitled (%s)" % data['mac'][-5:]
    node.timestamp = datetime.now()
    self.db.add(node)
    self.db.commit()
    node.attributes = NodeAttributes()
    try:
        node.attributes.volumes = node.volume_manager.gen_volumes_info()
        if node.cluster:
            node.cluster.add_pending_changes(
                "disks",
                node_id=node.id
            )
    except Exception as exc:
        msg = (
            u"Failed to generate volumes "
            "info for node '{0}': '{1}'"
        ).format(
            node.name or data.get("mac") or data.get("id"),
            str(exc) or "see logs for details"
        )
        logger.warning(traceback.format_exc())
        notifier.notify("error", msg, node_id=node.id)
    self.db.add(node)
    self.db.commit()
    network_manager = NetworkManager()
    # Add interfaces for node from 'meta'.
    if node.meta and node.meta.get('interfaces'):
        network_manager.update_interfaces_info(node.id)
    if node.cluster_id:
        network_manager.allow_network_assignment_to_all_interfaces(node.id)
        network_manager.assign_networks_to_main_interface(node.id)
    try:
        ram = str(round(float(
            node.meta['memory']['total']) / 1073741824, 1))
    except (KeyError, TypeError, ValueError):
        ram = "unknown"
    cores = str(node.meta.get('cpu', {}).get('total', "unknown"))
    notifier.notify(
        "discover",
        "New node with %s CPU core(s) "
        "and %s GB memory is discovered" % (cores, ram),
        node_id=node.id
    )
    raise web.webapi.created(json.dumps(
        NodeHandler.render(node),
        indent=4
    ))
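# Aside (illustrative): 1073741824 is 2 ** 30, i.e. the bytes-per-GiB
# divisor used in the discovery notification above. A named constant and
# helper (both hypothetical, introduced only for this sketch):
BYTES_PER_GIB = 2 ** 30

def bytes_to_gib(total_bytes):
    """Round a byte count to one decimal place of gibibytes."""
    return round(float(total_bytes) / BYTES_PER_GIB, 1)

# e.g. bytes_to_gib(17179869184) == 16.0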
def POST(self):
    data = self.validator.validate(web.data())
    node = Node()
    for key, value in data.iteritems():
        setattr(node, key, value)
    node.name = "Untitled (%s)" % data['mac'][-5:]
    node.timestamp = datetime.now()
    self.db.add(node)
    self.db.commit()
    node.attributes = NodeAttributes()
    try:
        node.attributes.volumes = node.volume_manager.gen_volumes_info()
        if node.cluster:
            node.cluster.add_pending_changes(
                "disks",
                node_id=node.id
            )
    except Exception as exc:
        msg = (
            u"Failed to generate volumes "
            "info for node '{0}': '{1}'"
        ).format(
            node.name or data.get("mac") or data.get("id"),
            str(exc) or "see logs for details"
        )
        logger.warning(traceback.format_exc())
        notifier.notify("error", msg, node_id=node.id)
    self.db.add(node)
    self.db.commit()
    # Add interfaces for node from 'meta'.
    if node.meta and node.meta.get('interfaces'):
        nics = self.get_nics_from_meta(node)
        map(self.db.add, nics)
        self.db.commit()
    if node.cluster_id:
        self.allow_network_assignment_to_all_interfaces(node)
        self.assign_networks_to_main_interface(node)
        self.db.commit()
    try:
        ram = str(round(float(
            node.meta['memory']['total']) / 1073741824, 1))
    except (KeyError, TypeError, ValueError):
        ram = "unknown"
    cores = str(node.meta.get('cpu', {}).get('total', "unknown"))
    notifier.notify(
        "discover",
        "New node with %s CPU core(s) "
        "and %s GB memory is discovered" % (cores, ram),
        node_id=node.id
    )
    raise web.webapi.created(json.dumps(
        NodeHandler.render(node),
        indent=4
    ))
@classmethod
def _success_action(cls, task, status, progress):
    # check if all nodes are ready
    if any(map(lambda n: n.status == 'error', task.cluster.nodes)):
        cls._error_action(task, 'error', 100)
        return
    if task.cluster.mode in ('singlenode', 'multinode'):
        # determining horizon url - it's the IP
        # of the first cluster controller
        controller = cls.db.query(Node).filter_by(
            cluster_id=task.cluster_id,
            role='controller').first()
        if controller:
            logger.debug(
                u"Controller is found, node_id=%s, "
                "getting its IP addresses", controller.id)
            public_net = filter(
                lambda n: n['name'] == 'public' and 'ip' in n,
                cls.network_manager.get_node_networks(controller.id))
            if public_net:
                horizon_ip = public_net[0]['ip'].split('/')[0]
                message = (
                    u"Deployment of environment '{0}' is done. "
                    "Access WebUI of OpenStack at http://{1}/ or via "
                    "internal network at http://{2}/").format(
                        task.cluster.name,
                        horizon_ip,
                        controller.ip)
            else:
                message = (
                    u"Deployment of environment '{0}' is done").format(
                        task.cluster.name)
                logger.warning(u"Public ip for controller node "
                               "not found in '{0}'".format(
                                   task.cluster.name))
        else:
            message = (u"Deployment of environment"
                       " '{0}' is done").format(task.cluster.name)
            logger.warning("Controller node not found in '{0}'".format(
                task.cluster.name))
    elif task.cluster.mode == 'ha':
        # determining horizon url in HA mode - it's the vip
        # from a public network saved in the task cache
        args = task.cache.get('args')
        try:
            vip = args['attributes']['public_vip']
            message = (u"Deployment of environment '{0}' is done. "
                       "Access WebUI of OpenStack at http://{1}/").format(
                           task.cluster.name,
                           vip)
        except Exception as exc:
            logger.error(": ".join([str(exc),
                                    traceback.format_exc()]))
            message = (u"Deployment of environment"
                       " '{0}' is done").format(task.cluster.name)
            logger.warning(u"Cannot find virtual IP for '{0}'".format(
                task.cluster.name))
    else:
        # Fallback so 'message' is defined for any other cluster mode.
        message = (u"Deployment of environment"
                   " '{0}' is done").format(task.cluster.name)
    notifier.notify("done", message, task.cluster_id)
    TaskHelper.update_task_status(task.uuid, status, progress, message)
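# Aside (illustrative, not from the original module): get_node_networks()
# evidently returns addresses in CIDR form, so the dashboard URL needs only
# the host part; `cidr_to_host` is a hypothetical helper name for this sketch.
def cidr_to_host(addr):
    """Strip the prefix length from an 'a.b.c.d/nn' string."""
    return addr.split('/')[0]

# e.g. cidr_to_host("172.16.0.2/24") == "172.16.0.2"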
@classmethod
def _error_action(cls, task, status, progress, message=None):
    if message:
        message = u"Deployment has failed. {0}".format(message)
    else:
        message = u"Deployment has failed. Check these nodes:\n{0}".format(
            cls._generate_error_message(
                task,
                error_types=('deploy', 'provision'),
                names_only=True
            )
        )
    notifier.notify("error", message, task.cluster_id)
    TaskHelper.update_task_status(task.uuid, status, progress, message)
@classmethod
def remove_cluster_resp(cls, **kwargs):
    network_manager = NetworkManager()
    logger.info("RPC method remove_cluster_resp received: %s" % kwargs)
    task_uuid = kwargs.get('task_uuid')
    cls.remove_nodes_resp(**kwargs)
    task = db().query(Task).filter_by(uuid=task_uuid).first()
    cluster = task.cluster
    if task.status in ('ready',):
        logger.debug("Removing environment itself")
        cluster_name = cluster.name
        nws = itertools.chain(
            *[n.networks for n in cluster.network_groups]
        )
        ips = db().query(IPAddr).filter(
            IPAddr.network.in_([n.id for n in nws])
        )
        map(db().delete, ips)
        db().commit()
        db().delete(cluster)
        db().commit()
        # Dmitry's hack for clearing VLANs without networks
        network_manager.clear_vlans()
        notifier.notify(
            "done",
            u"Environment '%s' and all its nodes are deleted" % (
                cluster_name
            )
        )
    elif task.status in ('error',):
        cluster.status = 'error'
        db().add(cluster)
        db().commit()
        if not task.message:
            task.message = "Failed to delete nodes:\n{0}".format(
                cls._generate_error_message(
                    task,
                    error_types=('deletion',)
                )
            )
        notifier.notify(
            "error",
            task.message,
            cluster.id
        )
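# Aside (illustrative): itertools.chain(*iterables) flattens one level of
# nesting, which is how the networks of all network groups are collected
# above before their IPs are deleted:
import itertools

groups = [[1, 2], [3], []]  # stand-ins for each group's n.networks
assert list(itertools.chain(*groups)) == [1, 2, 3]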
@classmethod
def remove_nodes_resp(cls, **kwargs):
    logger.info("RPC method remove_nodes_resp received: %s" % kwargs)
    task_uuid = kwargs.get('task_uuid')
    nodes = kwargs.get('nodes') or []
    error_nodes = kwargs.get('error_nodes') or []
    error_msg = kwargs.get('error')
    status = kwargs.get('status')
    progress = kwargs.get('progress')
    for node in nodes:
        node_db = cls.db.query(Node).get(node['uid'])
        if not node_db:
            logger.error(
                u"Failed to delete node '%s': node doesn't exist",
                str(node)
            )
            break
        cls.db.delete(node_db)
    for node in error_nodes:
        node_db = cls.db.query(Node).get(node['uid'])
        if not node_db:
            logger.error(
                u"Failed to delete node '%s' marked as error from Naily:"
                " node doesn't exist",
                str(node)
            )
            break
        node_db.pending_deletion = False
        node_db.status = 'error'
        cls.db.add(node_db)
        node['name'] = node_db.name
    cls.db.commit()
    success_msg = u"No nodes were removed"
    err_msg = u"No errors occurred"
    if nodes:
        success_msg = u"Successfully removed {0} node(s)".format(
            len(nodes)
        )
        notifier.notify("done", success_msg)
    if error_nodes:
        err_msg = u"Failed to remove {0} node(s): {1}".format(
            len(error_nodes),
            ', '.join(
                [n.get('name') or "ID: {0}".format(n['uid'])
                 for n in error_nodes])
        )
        notifier.notify("error", err_msg)
    if not error_msg:
        error_msg = ". ".join([success_msg, err_msg])
    TaskHelper.update_task_status(task_uuid, status, progress, error_msg)
def update_status_nodes(self):
    for node_db in self.db.query(Node).filter(
        # nodes may become unresponsive while provisioning
        not_(Node.status == 'provisioning')):
        timedelta = (datetime.now() - node_db.timestamp).seconds
        if timedelta > self.timeout:
            logger.warning(
                u"Node '{0}' seems to be offline "
                "for {1} seconds...".format(
                    node_db.name,
                    timedelta))
            if node_db.online:
                node_db.online = False
                self.db.commit()
                notifier.notify(
                    "error",
                    u"Node '{0}' has gone away".format(
                        node_db.human_readable_name),
                    node_id=node_db.id)
def run(self):
    super(FakeDeletionThread, self).run()
    receiver = NailgunReceiver
    receiver.initialize()
    kwargs = {
        'task_uuid': self.task_uuid,
        'nodes': self.data['args']['nodes'],
        'status': 'ready'
    }
    nodes_to_restore = self.data['args'].get('nodes_to_restore', [])
    tick_interval = int(settings.FAKE_TASKS_TICK_INTERVAL) or 3
    resp_method = getattr(receiver, self.respond_to)
    resp_method(**kwargs)
    orm = make_session()
    for node_data in nodes_to_restore:
        node = Node(**node_data)
        # An offline node was just deleted from the db
        # and cannot be recreated with status 'discover'
        if not node.online:
            continue
        node.status = 'discover'
        orm.add(node)
        orm.commit()
        node.attributes = NodeAttributes(node_id=node.id)
        node.attributes.volumes = node.volume_manager.gen_volumes_info()
        network_manager = NetworkManager(db=orm)
        network_manager.update_interfaces_info(node.id)
        orm.commit()
        ram = round(node.meta.get('ram') or 0, 1)
        cores = node.meta.get('cores') or 'unknown'
        notifier.notify(
            "discover",
            "New node with %s CPU core(s) "
            "and %s GB memory is discovered" % (cores, ram),
            node_id=node.id
        )
    receiver.stop()
@classmethod
def remove_cluster_resp(cls, **kwargs):
    logger.info("RPC method remove_cluster_resp received: %s" % kwargs)
    task_uuid = kwargs.get('task_uuid')
    cls.remove_nodes_resp(**kwargs)
    task = cls.db.query(Task).filter_by(uuid=task_uuid).first()
    cluster = task.cluster
    if task.status in ('ready',):
        logger.debug("Removing environment itself")
        cluster_name = cluster.name
        nws = itertools.chain(
            *[n.networks for n in cluster.network_groups])
        ips = cls.db.query(IPAddr).filter(
            IPAddr.network.in_([n.id for n in nws]))
        map(cls.db.delete, ips)
        cls.db.commit()
        cls.db.delete(cluster)
        cls.db.commit()
        # Dmitry's hack for clearing VLANs without networks
        cls.network_manager.clear_vlans()
        notifier.notify(
            "done",
            u"Environment '%s' and all its nodes are deleted" % (
                cluster_name))
    elif task.status in ('error',):
        cluster.status = 'error'
        cls.db.add(cluster)
        cls.db.commit()
        if not task.message:
            task.message = "Failed to delete nodes:\n{0}".format(
                cls._generate_error_message(
                    task,
                    error_types=('deletion',)))
        notifier.notify("error", task.message, cluster.id)
def checked_data(self, validate_method=None):
    try:
        if validate_method:
            data = validate_method(web.data())
        else:
            data = self.validator.validate(web.data())
    except (
        errors.InvalidInterfacesInfo,
        errors.InvalidMetadata
    ) as exc:
        notifier.notify("error", str(exc))
        raise web.badrequest(message=str(exc))
    except (
        errors.AlreadyExists
    ) as exc:
        err = web.conflict()
        err.message = exc.message
        raise err
    except (
        errors.InvalidData,
        Exception
    ) as exc:
        raise web.badrequest(message=str(exc))
    return data
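# Aside (illustrative): callers may pass an alternate validator entry point;
# the collection PUT handler further below does exactly that:
#
#     data = self.checked_data(self.validator.validate_collection_update)
#
# With no argument, the handler's default self.validator.validate is applied.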
@classmethod
def _success_action(cls, task, status, progress):
    network_manager = NetworkManager()
    # check if all nodes are ready
    if any(map(lambda n: n.status == 'error', task.cluster.nodes)):
        cls._error_action(task, 'error', 100)
        return
    if task.cluster.mode in ('singlenode', 'multinode'):
        # determining horizon url - it's the IP
        # of the first cluster controller
        controller = db().query(Node).filter_by(
            cluster_id=task.cluster_id,
            role='controller'
        ).first()
        if controller:
            logger.debug(
                u"Controller is found, node_id=%s, "
                "getting its IP addresses",
                controller.id
            )
            public_net = filter(
                lambda n: n['name'] == 'public' and 'ip' in n,
                network_manager.get_node_networks(controller.id)
            )
            if public_net:
                horizon_ip = public_net[0]['ip'].split('/')[0]
                message = (
                    u"Deployment of environment '{0}' is done. "
                    "Access the OpenStack dashboard (Horizon) at "
                    "http://{1}/ or via internal network at http://{2}/"
                ).format(
                    task.cluster.name,
                    horizon_ip,
                    controller.ip
                )
            else:
                message = (
                    u"Deployment of environment '{0}' is done"
                ).format(task.cluster.name)
                logger.warning(
                    u"Public ip for controller node "
                    "not found in '{0}'".format(task.cluster.name)
                )
        else:
            message = (
                u"Deployment of environment"
                " '{0}' is done"
            ).format(task.cluster.name)
            logger.warning("Controller node not found in '{0}'".format(
                task.cluster.name
            ))
    elif task.cluster.mode == 'ha':
        # determining horizon url in HA mode - it's the vip
        # from a public network saved in the task cache
        args = task.cache.get('args')
        try:
            vip = args['attributes']['public_vip']
            message = (
                u"Deployment of environment '{0}' is done. "
                "Access the OpenStack dashboard (Horizon) at http://{1}/"
            ).format(
                task.cluster.name,
                vip
            )
        except Exception as exc:
            logger.error(": ".join([
                str(exc),
                traceback.format_exc()
            ]))
            message = (
                u"Deployment of environment"
                " '{0}' is done"
            ).format(task.cluster.name)
            logger.warning(
                u"Cannot find virtual IP for '{0}'".format(
                    task.cluster.name
                )
            )
    else:
        # Fallback so 'message' is defined for any other cluster mode.
        message = (
            u"Deployment of environment"
            " '{0}' is done"
        ).format(task.cluster.name)
    notifier.notify(
        "done",
        message,
        task.cluster_id
    )
    TaskHelper.update_task_status(task.uuid, status, progress, message)
def PUT(self):
    data = self.validator.validate_collection_update(web.data())
    q = self.db.query(Node)
    nodes_updated = []
    for nd in data:
        is_agent = nd.pop("is_agent") if "is_agent" in nd else False
        node = None
        if "mac" in nd:
            node = q.filter_by(mac=nd["mac"]).first() \
                or self.validator.validate_existent_node_mac(nd)
        else:
            node = q.get(nd["id"])
        if nd.get("cluster_id") is None and node.cluster:
            node.cluster.clear_pending_changes(node_id=node.id)
        old_cluster_id = node.cluster_id
        for key, value in nd.iteritems():
            if is_agent and (key, value) == ("status", "discover") \
                    and node.status == "provisioning":
                # We don't update provisioning back to discover
                logger.debug(
                    "Node is already provisioning - "
                    "status not updated by agent"
                )
                continue
            setattr(node, key, value)
        if not node.attributes:
            node.attributes = NodeAttributes()
            self.db.commit()
        if not node.attributes.volumes:
            node.attributes.volumes = \
                node.volume_manager.gen_volumes_info()
            self.db.commit()
        if node.status not in ('provisioning', 'deploying'):
            variants = (
                "disks" in node.meta and len(node.meta["disks"]) != len(
                    filter(
                        lambda d: d["type"] == "disk",
                        node.attributes.volumes
                    )
                ),
                "role" in nd,
                "cluster_id" in nd
            )
            if any(variants):
                try:
                    node.attributes.volumes = \
                        node.volume_manager.gen_volumes_info()
                    if node.cluster:
                        node.cluster.add_pending_changes(
                            "disks",
                            node_id=node.id
                        )
                except Exception as exc:
                    msg = (
                        "Failed to generate volumes "
                        "info for node '{0}': '{1}'"
                    ).format(
                        # nd is the per-node dict; the payload is a list
                        node.name or nd.get("mac") or nd.get("id"),
                        str(exc) or "see logs for details"
                    )
                    logger.warning(traceback.format_exc())
                    notifier.notify("error", msg, node_id=node.id)
            self.db.commit()
        if is_agent:
            node.timestamp = datetime.now()
            if not node.online:
                node.online = True
                msg = u"Node '{0}' is back online".format(
                    node.name or node.mac
                )
                logger.info(msg)
                notifier.notify(
                    "discover",
                    msg,
                    node_id=node.id
                )
            # Update node's NICs.
            if node.meta and 'interfaces' in node.meta:
                db_nics = list(node.interfaces)
                nics = self.get_nics_from_meta(node)
                for nic in nics:
                    db_nic = filter(lambda i: i.mac == nic.mac, db_nics)
                    if not db_nic:
                        self.db.add(nic)
                        continue
                    db_nic = db_nic[0]
                    for key in ('name', 'current_speed', 'max_speed'):
                        setattr(db_nic, key, getattr(nic, key))
                    db_nics.remove(db_nic)
                map(self.db.delete, db_nics)
        nodes_updated.append(node)
        self.db.commit()
        if 'cluster_id' in nd and nd['cluster_id'] != old_cluster_id:
            if old_cluster_id:
                self.clear_assigned_networks(node)
                self.clear_all_allowed_networks(node)
            if nd['cluster_id']:
                self.allow_network_assignment_to_all_interfaces(node)
                self.assign_networks_to_main_interface(node)
            self.db.commit()
    return map(NodeHandler.render, nodes_updated)
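# Aside (illustrative): the `variants` tuple above re-generates the volume
# layout when the number of physical disks in node metadata stops matching
# the number of disk entries in the stored volumes. In outline (Python 2
# semantics, where filter() returns a list):
meta_disks = [{'name': 'sda'}, {'name': 'sdb'}]
volumes = [{'type': 'disk'}, {'type': 'vg'}]
disks_in_volumes = filter(lambda d: d['type'] == 'disk', volumes)
needs_regen = len(meta_disks) != len(disks_in_volumes)  # True here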
@classmethod
def deploy_resp(cls, **kwargs):
    logger.info("RPC method deploy_resp received: %s" % kwargs)
    task_uuid = kwargs.get('task_uuid')
    nodes = kwargs.get('nodes') or []
    message = kwargs.get('error')
    status = kwargs.get('status')
    progress = kwargs.get('progress')
    task = db().query(Task).filter_by(uuid=task_uuid).first()
    if not task:
        # No task found - nothing to do here, returning
        logger.warning(
            u"No task with uuid '{0}' found - nothing changed".format(
                task_uuid
            )
        )
        return
    if not status:
        status = task.status
    # First of all, let's update nodes in database
    for node in nodes:
        node_db = db().query(Node).get(node['uid'])
        if not node_db:
            logger.warning(
                u"No node found with uid '{0}' - nothing changed".format(
                    node['uid']
                )
            )
            continue
        update_fields = (
            'error_msg',
            'error_type',
            'status',
            'progress',
            'online'
        )
        for param in update_fields:
            if param in node:
                logger.debug(
                    u"Updating node {0} - set {1} to {2}".format(
                        node['uid'],
                        param,
                        node[param]
                    )
                )
                setattr(node_db, param, node[param])
                # Parentheses spell out the precedence Python applies
                # here anyway: progress update on an errored node, or a
                # node reported offline, forces progress to 100.
                if (param == 'progress' and node.get('status') == 'error') \
                        or node.get('online') is False:
                    # If a failure occurred on the node,
                    # its progress should be 100
                    node_db.progress = 100
                    # Setting node error_msg for offline nodes
                    if node.get('online') is False \
                            and not node_db.error_msg:
                        node_db.error_msg = u"Node is offline"
                    # Notification on particular node failure
                    notifier.notify(
                        "error",
                        u"Failed to deploy node '{0}': {1}".format(
                            node_db.name,
                            node_db.error_msg or "Unknown error"
                        ),
                        cluster_id=task.cluster_id,
                        node_id=node['uid'],
                        task_uuid=task_uuid
                    )
        db().add(node_db)
        db().commit()
    # We should calculate task progress by nodes info
    task = db().query(Task).filter_by(uuid=task_uuid).first()
    coeff = settings.PROVISIONING_PROGRESS_COEFF or 0.3
    if nodes and not progress:
        nodes_progress = []
        nodes_db = db().query(Node).filter_by(
            cluster_id=task.cluster_id).all()
        for node in nodes_db:
            if node.status == "discover":
                nodes_progress.append(0)
            elif not node.online:
                nodes_progress.append(100)
            elif node.status in ['provisioning', 'provisioned'] or \
                    node.needs_reprovision:
                nodes_progress.append(float(node.progress) * coeff)
            elif node.status in ['deploying', 'ready'] or \
                    node.needs_redeploy:
                nodes_progress.append(
                    100.0 * coeff + float(node.progress) * (1.0 - coeff)
                )
        if nodes_progress:
            progress = int(
                float(sum(nodes_progress)) / len(nodes_progress)
            )
    # Let's check the whole task status
    if status in ('error',):
        cls._error_action(task, status, progress, message)
    elif status in ('ready',):
        cls._success_action(task, status, progress)
    else:
        TaskHelper.update_task_status(task.uuid, status, progress, message)
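# Aside (illustrative): the progress calculation above weights the
# provisioning phase by PROVISIONING_PROGRESS_COEFF (default 0.3) and the
# deployment phase by the remainder. The same arithmetic as a standalone
# sketch (`weighted_progress` is a hypothetical name):
def weighted_progress(phase, percent, coeff=0.3):
    """Map a per-phase percent (0-100) onto the combined 0-100 scale."""
    if phase == 'provisioning':
        return percent * coeff                       # 0..30
    return 100.0 * coeff + percent * (1.0 - coeff)   # 30..100

# e.g. weighted_progress('provisioning', 50) == 15.0
#      weighted_progress('deploying', 50) == 65.0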
@classmethod
def deploy_resp(cls, **kwargs):
    logger.info("RPC method deploy_resp received: %s" % kwargs)
    task_uuid = kwargs.get('task_uuid')
    nodes = kwargs.get('nodes') or []
    message = kwargs.get('error')
    status = kwargs.get('status')
    progress = kwargs.get('progress')
    task = cls.db.query(Task).filter_by(uuid=task_uuid).first()
    if not task:
        # No task found - nothing to do here, returning
        logger.warning(
            u"No task with uuid '{0}' found - nothing changed".format(
                task_uuid))
        return
    if not status:
        status = task.status
    # First of all, let's update nodes in database
    for node in nodes:
        node_db = cls.db.query(Node).get(node['uid'])
        if not node_db:
            logger.warning(
                u"No node found with uid '{0}' - nothing changed".format(
                    node['uid']))
            continue
        update_fields = ('error_msg', 'error_type', 'status',
                         'progress', 'online')
        for param in update_fields:
            if param in node:
                logger.debug(u"Updating node {0} - set {1} to {2}".format(
                    node['uid'], param, node[param]))
                setattr(node_db, param, node[param])
                # Parentheses spell out the precedence Python applies
                # here anyway: progress update on an errored node, or a
                # node reported offline, forces progress to 100.
                if (param == 'progress' and node.get('status') == 'error') \
                        or node.get('online') is False:
                    # If a failure occurred on the node,
                    # its progress should be 100
                    node_db.progress = 100
                    # Setting node error_msg for offline nodes
                    if node.get('online') is False \
                            and not node_db.error_msg:
                        node_db.error_msg = u"Node is offline"
                    # Notification on particular node failure
                    notifier.notify(
                        "error",
                        u"Failed to deploy node '{0}': {1}".format(
                            node_db.name,
                            node_db.error_msg or "Unknown error"),
                        cluster_id=task.cluster_id,
                        node_id=node['uid'],
                        task_uuid=task_uuid)
        cls.db.add(node_db)
        cls.db.commit()
    # We should calculate task progress by nodes info
    task = cls.db.query(Task).filter_by(uuid=task_uuid).first()
    coeff = settings.PROVISIONING_PROGRESS_COEFF or 0.3
    if nodes and not progress:
        nodes_progress = []
        nodes_db = cls.db.query(Node).filter_by(
            cluster_id=task.cluster_id).all()
        for node in nodes_db:
            if node.status == "discover":
                nodes_progress.append(0)
            elif not node.online:
                nodes_progress.append(100)
            elif node.status in ['provisioning', 'provisioned'] or \
                    node.needs_reprovision:
                nodes_progress.append(float(node.progress) * coeff)
            elif node.status in ['deploying', 'ready'] or \
                    node.needs_redeploy:
                nodes_progress.append(100.0 * coeff +
                                      float(node.progress) * (1.0 - coeff))
        if nodes_progress:
            progress = int(
                float(sum(nodes_progress)) / len(nodes_progress))
    # Let's check the whole task status
    if status in ('error',):
        cls._error_action(task, status, progress, message)
    elif status in ('ready',):
        cls._success_action(task, status, progress)
    else:
        TaskHelper.update_task_status(task.uuid, status, progress, message)
def PUT(self):
    data = self.checked_data(
        self.validator.validate_collection_update
    )
    network_manager = NetworkManager()
    q = self.db.query(Node)
    nodes_updated = []
    for nd in data:
        is_agent = nd.pop("is_agent") if "is_agent" in nd else False
        node = None
        if "mac" in nd:
            node = q.filter_by(mac=nd["mac"]).first() \
                or self.validator.validate_existent_node_mac_update(nd)
        else:
            node = q.get(nd["id"])
        if is_agent:
            node.timestamp = datetime.now()
            if not node.online:
                node.online = True
                msg = u"Node '{0}' is back online".format(
                    node.human_readable_name)
                logger.info(msg)
                notifier.notify("discover", msg, node_id=node.id)
            self.db.commit()
        if nd.get("cluster_id") is None and node.cluster:
            node.cluster.clear_pending_changes(node_id=node.id)
        old_cluster_id = node.cluster_id
        for key, value in nd.iteritems():
            if is_agent and (key, value) == ("status", "discover") \
                    and node.status == "provisioning":
                # We don't update provisioning back to discover
                logger.debug(
                    "Node is already provisioning - "
                    "status not updated by agent"
                )
                continue
            if key == "meta":
                node.update_meta(value)
            else:
                setattr(node, key, value)
        self.db.commit()
        if not node.attributes:
            node.attributes = NodeAttributes()
            self.db.commit()
        if not node.attributes.volumes:
            node.attributes.volumes = \
                node.volume_manager.gen_volumes_info()
            self.db.commit()
        if node.status not in ('provisioning', 'deploying'):
            variants = (
                "disks" in node.meta and len(node.meta["disks"]) != len(
                    filter(
                        lambda d: d["type"] == "disk",
                        node.attributes.volumes
                    )
                ),
                "role" in nd,
                "cluster_id" in nd
            )
            if any(variants):
                try:
                    node.attributes.volumes = \
                        node.volume_manager.gen_volumes_info()
                    if node.cluster:
                        node.cluster.add_pending_changes(
                            "disks",
                            node_id=node.id
                        )
                except Exception as exc:
                    msg = (
                        "Failed to generate volumes "
                        "info for node '{0}': '{1}'"
                    ).format(
                        # nd is the per-node dict; the payload is a list
                        node.name or nd.get("mac") or nd.get("id"),
                        str(exc) or "see logs for details"
                    )
                    logger.warning(traceback.format_exc())
                    notifier.notify("error", msg, node_id=node.id)
            self.db.commit()
        if is_agent:
            # Update node's NICs.
            if node.meta and 'interfaces' in node.meta:
                # we won't update interfaces if data is invalid
                network_manager.update_interfaces_info(node.id)
        nodes_updated.append(node)
        self.db.commit()
        if 'cluster_id' in nd and nd['cluster_id'] != old_cluster_id:
            if old_cluster_id:
                network_manager.clear_assigned_networks(node.id)
                network_manager.clear_all_allowed_networks(node.id)
            if nd['cluster_id']:
                network_manager.allow_network_assignment_to_all_interfaces(
                    node.id
                )
                network_manager.assign_networks_to_main_interface(node.id)
    return map(NodeHandler.render, nodes_updated)