def _check_ip_list(self, ipList=None):
    if not ipList:
        ipList = self.hosts
    invalid_list = []
    for ip in ipList.split(','):
        if not utils.is_valid_ip(ip):
            invalid_list.append(ip)
    if invalid_list:
        term.fatal('{} is invalid.'.format(','.join(invalid_list)))
        exit(1)
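# For reference, a minimal sketch of the kind of check utils.is_valid_ip may
# perform (an assumption -- the real helper lives in tiops' utils module, and
# the name _is_valid_ipv4_sketch is hypothetical): accept only dotted-quad
# IPv4 strings.
import socket

def _is_valid_ipv4_sketch(ip):
    try:
        socket.inet_aton(ip)
    except (socket.error, TypeError):
        return False
    # inet_aton also accepts short forms like '10.1'; require four octets
    return ip.count('.') == 3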
def check_os_version(self, facts=None):
    _lower_version = []
    for _host, _vars in facts['success'].items():
        # get system version
        _sysversion = str(
            _vars['ansible_facts']['ansible_distribution_version'])
        # compare the major version numerically; a plain string comparison
        # would misorder multi-digit majors (e.g. '10' < '7' is True)
        if int(_sysversion.split('.')[0]) < 7:
            _lower_version.append([_host, _sysversion])
    if _lower_version:
        term.fatal('Some machines\' OS version is not supported.')
        _length = max(max([len(str(x[0])) for x in _lower_version]),
                      len('IP'))
        term.normal('IP'.ljust(_length + 2) + 'OS_Version')
        for _node in _lower_version:
            term.normal('{}{}'.format(_node[0].ljust(_length + 2), _node[1]))
        exit(1)
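# Why the numeric comparison above matters: lexicographic string comparison
# misorders version numbers, e.g.
#
#   '10' < '7'                     # True  -- wrong for versions
#   int('10'.split('.')[0]) < 7    # False -- correct
#
# For full dotted versions, comparing tuples of ints is a safe pattern
# (_version_tuple is an illustrative helper, not part of tiops):
def _version_tuple(version):
    return tuple(int(part) for part in version.split('.') if part.isdigit())

assert _version_tuple('7.6') < _version_tuple('10.0')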
def __delete_component(self, config=None, component=None, pattern=None,
                       uuid=None):
    if component == 'pd':
        try:
            self._cluster.del_pd(uuid)
        except exceptions.TiOPSException as e:
            term.fatal(
                'Unable to delete PD node from cluster: {}'.format(e))
            exit(1)
    if component == 'tikv':
        _tikv_info = ''
        for _tikv_node in config[pattern]:
            if _tikv_node['uuid'] != uuid:
                continue
            if _tikv_node['offline']:
                return
            _tikv_info = _tikv_node
        for ctikv in self._tikv_stores['stores']:
            # check if node in cluster
            if '{}:{}'.format(
                    _tikv_info['ip'],
                    _tikv_info['port']) == ctikv['store']['address']:
                _store_id = ctikv['store']['id']
                # delete store through api
                try:
                    self._cluster.del_store(_store_id)
                except exceptions.TiOPSException as e:
                    term.fatal('Unable to delete store: {}'.format(e))
                    exit(1)
    if component == 'drainer':
        _binlog = modules.BinlogAPI(topology=self.topology)
        _binlog.delete_drainer(node_id=uuid)
    if component == 'pump':
        _binlog = modules.BinlogAPI(topology=self.topology)
        _binlog.delete_pump(node_id=uuid)
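# The TiKV branch above resolves a node's ip:port to its numeric store id by
# scanning the PD stores list. A self-contained illustration of that lookup,
# with a sample payload shaped like PD's stores API response (_find_store_id
# is an illustrative helper, not part of tiops):
def _find_store_id(stores, ip, port):
    address = '{}:{}'.format(ip, port)
    for entry in stores['stores']:
        if entry['store']['address'] == address:
            return entry['store']['id']
    return None

_sample = {'stores': [{'store': {'id': 4, 'address': '10.0.1.2:20160'}}]}
assert _find_store_id(_sample, '10.0.1.2', 20160) == 4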
def check_os_platform(self, facts=None):
    _unsupport_os = []
    # get operating system platform
    for _host, _vars in facts['success'].items():
        _platform = _vars['ansible_facts']['ansible_os_family']
        if 'redhat' == _platform.lower():
            continue
        _unsupport_os.append([_host, _platform])
    if _unsupport_os:
        term.fatal(
            'Some machines\' OS is not supported, please use RedHat / CentOS.'
        )
        _length = max(max([len(str(x[0])) for x in _unsupport_os]),
                      len('IP'))
        term.normal('IP'.ljust(_length + 2) + 'OS_Family')
        for _node in _unsupport_os:
            term.normal('{}{}'.format(_node[0].ljust(_length + 2), _node[1]))
        exit(1)
def _process(self, component=None, pattern=None, node=None, role=None):
    _unhealth_node = []
    for _pd_node in self._cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
    # abort if any PD node is unhealthy before scaling in
    if _unhealth_node:
        msg = 'Some PD nodes are unhealthy, maybe the server is stopped or the network is unreachable, unhealthy node list: {}'.format(
            ','.join(_unhealth_node))
        term.fatal(msg)
        raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')

    _current_pd_num = len(self._pd_status)
    # count the store entries, not the keys of the API response dict
    _current_tikv_num = len(self._tikv_stores['stores'])
    if 'pd_servers' in self._diff and len(
            self._diff['pd_servers']) == _current_pd_num:
        term.fatal('Cannot delete all PD nodes.')
        exit(1)
    if 'tikv_servers' in self._diff and len(
            self._diff['tikv_servers']) == _current_tikv_num:
        term.fatal('Cannot delete all TiKV nodes.')
        exit(1)

    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service, self._diff)
        if not component and not pattern:
            continue
        uuid = [x['uuid'] for x in self._diff[pattern]]
        term.normal('Delete {}, node list: {}'.format(
            component, ','.join(uuid)))
        for _uuid in uuid:
            self.__delete_component(self._diff, component, pattern, _uuid)
            if component not in ['tikv', 'pump', 'drainer']:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=_uuid)
            if component != 'blackbox_exporter':
                self.topology.replace(self.topology.remove(_uuid)[0])
def main(args=None):
    try:
        action = args.action
    except AttributeError:
        # no action given; leave it unset so no dispatch branch matches
        action = None

    if action == 'version':
        print(term.plain(TiOPSVer()))
        exit(0)

    if action == 'quickdeploy':
        term.warn(
            'The quick deploy mode is for demo and testing, do NOT use in production!'
        )
        # do init
        _init = init.Init(args)
        try:
            _init.init(demo=True)
            _init.init_network(demo=True)
            _init.init_host(demo=True)
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
        # do deploy
        topo = topology.Topology(args=args, merge=True)
        try:
            op.OprDeploy(args, topo, demo=True).do()
            op.OprStart(args, topo, demo=True).do()
            tm.TUIModule(topo, args=args).display()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSRequestError as e:
            msg = "{}, URL {} returned {}, please check the network and try again.".format(
                e.msg, e.url, e.code)
            term.error(msg)
            sys.exit(1)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-local':
        _init = init.Init(args)
        try:
            _init.init()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-ssh':
        _init = init.Init(args)
        try:
            _init.init_network()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-host':
        _init = init.Init(args)
        try:
            _init.init_host()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    else:
        try:
            if action not in ['deploy', 'display']:
                topo = topology.Topology(args)
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)

        if action == 'display':
            try:
                _cluster_name = args.cluster_name
            except AttributeError:
                _cluster_name = None
            try:
                if _cluster_name and len(_cluster_name) > 0:
                    topo = topology.Topology(args)
                    _list = False
                else:
                    topo = None
                    _list = True
                tm.TUIModule(topo, args=args).display(_list)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'deploy':
            topo = topology.Topology(args=args, merge=True)
            try:
                op.OprDeploy(args, topo).do()
                tm.TUIModule(topo, args=args).display()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSRequestError as e:
                msg = "{}, URL {} returned {}, please check the network and try again.".format(
                    e.msg, e.url, e.code)
                term.error(msg)
                sys.exit(1)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'start':
            try:
                op.OprStart(args, topo).do(node=args.node_id, role=args.role)
                tm.TUIModule(topo, args=args, status=True).display()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'stop':
            try:
                op.OprStop(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'restart':
            try:
                op.OprRestart(args, topo).do(node=args.node_id,
                                             role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'reload':
            try:
                op.OprReload(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'upgrade':
            try:
                op.OprUpgrade(args, topo).do(node=args.node_id,
                                             role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSRequestError as e:
                msg = "{}, URL {} returned {}, please check the network and try again.".format(
                    e.msg, e.url, e.code)
                term.error(msg)
                sys.exit(1)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'destroy':
            try:
                op.OprDestroy(args, topo).do()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'edit-config':
            try:
                Action(topo=topo).edit_file()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'scale-out':
            addTopo = utils.read_yaml(args.topology)
            try:
                op.OprScaleOut(args, topo, addTopo).do()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'scale-in':
            try:
                op.OprScaleIn(args, topo, args.node_id).do(node=args.node_id)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'exec':
            try:
                op.OprExec(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
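# main() expects args.action plus per-action attributes such as node_id, role
# and topology. A minimal sketch of how such a namespace could be built with
# argparse subcommands (an assumption -- the real tiops CLI defines its own
# parser; _build_parser_sketch is a hypothetical name):
import argparse

def _build_parser_sketch():
    parser = argparse.ArgumentParser(prog='tiops')
    sub = parser.add_subparsers(dest='action')
    for name in ('version', 'quickdeploy', 'deploy', 'start', 'stop',
                 'restart', 'reload', 'upgrade', 'destroy', 'display',
                 'edit-config', 'scale-out', 'scale-in', 'exec'):
        cmd = sub.add_parser(name)
        cmd.add_argument('--node-id', dest='node_id', default=None)
        cmd.add_argument('--role', default=None)
    return parser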
def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Reload specified node in cluster.')
    elif role:
        term.notice('Reload specified role in cluster.')
    else:
        term.notice('Reload TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)

    _cluster = modules.ClusterAPI(topology=self.topology)
    _unhealth_node = []
    for _pd_node in _cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
    # abort if any PD node is unhealthy before reloading
    if _unhealth_node:
        msg = 'Some PD nodes are unhealthy, maybe the server is stopped or the network is unreachable, unhealthy node list: {}'.format(
            ','.join(_unhealth_node))
        term.fatal(msg)
        raise exceptions.TiOPSRuntimeError(msg, operation='reload')

    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    # each service group entry should only contain one item
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service=service,
                                              config=_topology)
        if not component and not pattern:
            continue
        # reload PD servers, handling the leader node last
        if component == 'pd':
            _pd_list = []
            for _node in _topology[pattern]:
                if _node['uuid'] == _cluster.pd_leader():
                    _leader = _node
                else:
                    _pd_list.append(_node)
            _pd_list.append(_leader)
            for _node in _pd_list:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(
                    component, _uuid))
                if _uuid == _cluster.pd_leader():
                    _cluster.evict_pd_leader(uuid=_uuid)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)
            continue
        if pattern in [
                'monitored_servers', 'monitoring_server', 'grafana_server',
                'alertmanager_server'
        ]:
            if not node:
                term.normal('Reload {}.'.format(component))
                self.act.deploy_component(component=component,
                                          pattern=pattern)
                self.act.stop_component(component=component, pattern=pattern)
                self.act.start_component(component=component, pattern=pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Reload {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=','.join(_uuid))
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=','.join(_uuid))
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=','.join(_uuid))
            continue
        for _node in _topology[pattern]:
            _uuid = _node['uuid']
            _host = _node['ip']
            term.normal('Reload {}, node id: {}.'.format(component, _uuid))
            # for TiKV, evict region leaders before the restart and lift the
            # eviction afterwards
            if pattern == 'tikv_servers':
                _port = _node['port']
                _cluster.evict_store_leaders(host=_host, port=_port)
            self.act.deploy_component(component=component,
                                      pattern=pattern,
                                      node=_uuid)
            self.act.stop_component(component=component,
                                    pattern=pattern,
                                    node=_uuid)
            self.act.start_component(component=component,
                                     pattern=pattern,
                                     node=_uuid)
            if pattern == 'tikv_servers':
                _cluster.remove_evict(host=_host, port=_port)
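# The PD branch above restarts followers first and the leader last, evicting
# PD leadership before the leader itself restarts so the cluster keeps
# serving. A self-contained illustration of the reordering step (_leader_last
# is an illustrative helper, not part of tiops):
def _leader_last(nodes, leader_uuid):
    followers = [n for n in nodes if n['uuid'] != leader_uuid]
    leaders = [n for n in nodes if n['uuid'] == leader_uuid]
    return followers + leaders

assert _leader_last([{'uuid': 'a'}, {'uuid': 'b'}], 'a') == \
    [{'uuid': 'b'}, {'uuid': 'a'}]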