class OprRestart(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprRestart, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Restart specified node in cluster.')
        elif role:
            term.notice('Restart specified role in cluster.')
        else:
            term.notice('Restart TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.normal('Stopping {}.'.format(component))
                self.act.stop_component(component, pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Stopping {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.stop_component(component, pattern, ','.join(_uuid))

        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.normal('Starting {}.'.format(component))
                self.act.start_component(component, pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Starting {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.start_component(component, pattern, ','.join(_uuid))

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished restart.')
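# NOTE: illustrative sketch only, not called by tiops. It shows the ordering
# OprRestart uses above: services are stopped in reverse service_group order
# and started again in the original order, so lower-level components come
# back first. The flat list of service names below is a hypothetical
# simplification of the real service_group structure.
def _restart_order_sketch():
    service_group = ['pd', 'tikv', 'tidb', 'monitoring']  # hypothetical order
    stop_order = service_group[::-1]   # ['monitoring', 'tidb', 'tikv', 'pd']
    start_order = service_group        # ['pd', 'tikv', 'tidb', 'monitoring']
    return stop_order, start_order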
class OprScaleOut(OperationBase):
    def __init__(self, args=None, topology=None, new_srvs=None):
        if os.path.exists(topology.topology_file):
            term.warn(
                'Check TiDB cluster {} status, it may take a few minutes.'.format(
                    topology.cluster_name))
            self.check_tombstone(topology, args)
        self._new_topo, self._diff = topology.add(new_srvs)
        topology.replace(self._new_topo, write=False)
        super(OprScaleOut, self).__init__(args, topology, action='deploy')
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        if not self._diff:
            msg = 'No new nodes to scale out.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)
        term.notice('Begin adding nodes to the TiDB cluster.')

        # copy templates
        utils.create_dir(self.topology.cache_template_dir)
        utils.copy_template(source=os.path.join(self.topology.titemplate_dir),
                            target=os.path.join(self.topology.cache_template_dir))

        # update run scripts when scaling out
        for service in ['pd', 'tikv', 'tidb', 'pump', 'drainer']:
            if '{}_servers'.format(service) in self._diff:
                template_path = os.path.join(
                    self.topology.cache_template_dir,
                    'scripts/run_{}.sh.j2'.format(service))
                _original, new_template = utils.script_template(
                    path=self.topology.cluster_dir,
                    template=template_path,
                    service=service)
                utils.write_template(template_path, new_template)

    def _process(self, component=None, pattern=None, node=None, role=None):
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        self.act.edit_file()
        try:
            term.info('Create directory on all added nodes.')
            for service in self.topology.service_group:
                component, pattern = self.check_exist(service, self._diff)
                if not component and not pattern:
                    continue
                uuid = [x['uuid'] for x in self._diff[pattern]]
                self.act.create_directory(component=component,
                                          pattern=pattern,
                                          node=','.join(uuid))

            # check machine cpu / memory / disk
            self.act.check_machine_config(self._diff)

            # run the scale-out
            for service in self.topology.service_group:
                component, pattern = self.check_exist(service, self._diff)
                if not component and not pattern:
                    continue
                uuid = [x['uuid'] for x in self._diff[pattern]]
                term.normal('Add {}, node list: {}.'.format(
                    component, ','.join(uuid)))
                _template_dir = self.topology.cache_template_dir
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=','.join(uuid),
                                          template_dir=_template_dir)
                self.act.deploy_firewall(component=component,
                                         pattern=pattern,
                                         node=','.join(uuid))
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=','.join(uuid))
        finally:
            os.popen('rm -rf {}'.format(self.topology.cache_template_dir))

    def _post(self, component=None, pattern=None, node=None, role=None):
        # if 'pd_servers' in self._diff:
        #     reload_pd = True
        # else:
        #     reload_pd = False
        self.topology.replace(self._new_topo)
        term.info('Update configuration.')
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology._topology(self._new_topo),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer', pattern='drainer_servers')
        act.deploy_component(component='prometheus', pattern='monitoring_server')
        act.stop_component(component='prometheus', pattern='monitoring_server')
        act.start_component(component='prometheus', pattern='monitoring_server')
        term.notice('Finished scaling out.')
class OprScaleIn(OperationBase):
    def __init__(self, args=None, topology=None, node=None):
        if not node:
            msg = 'Node ID not specified.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)
        self._new_topo, self._diff = topology.remove(node)
        super(OprScaleIn, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.notice('Begin deleting nodes from the TiDB cluster.')
        self._cluster = modules.ClusterAPI(topology=self.topology)
        self._pd_status = self._cluster.status()
        self._tikv_stores = self._cluster.tikv_stores()

    def _process(self, component=None, pattern=None, node=None, role=None):
        _unhealth_node = []
        for _pd_node in self._cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = ('Some PD nodes are unhealthy, maybe the server is stopped '
                       'or the network is unreachable, unhealthy node list: {}').format(
                           ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')

        _current_pd_num = len(self._pd_status)
        _current_tikv_num = len(self._tikv_stores)

        if 'pd_servers' in self._diff and len(
                self._diff['pd_servers']) == _current_pd_num:
            term.fatal('Cannot delete all PD nodes.')
            exit(1)

        if 'tikv_servers' in self._diff and len(
                self._diff['tikv_servers']) == _current_tikv_num:
            term.fatal('Cannot delete all TiKV nodes.')
            exit(1)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            term.normal('Delete {}, node list: {}'.format(
                component, ','.join(uuid)))
            for _uuid in uuid:
                self.__delete_component(self._diff, component, pattern, _uuid)
                if component not in ['tikv', 'pump', 'drainer']:
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.destroy_component(component=component,
                                               pattern=pattern,
                                               node=_uuid)
                    if component != 'blackbox_exporter':
                        self.topology.replace(self.topology.remove(_uuid)[0])

    def _post(self, component=None, pattern=None, node=None, role=None):
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology(),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer', pattern='drainer_servers')
        # self.deploy.deploy_component(component='prometheus', pattern='monitoring_server', ans=ans)
        # self.reload.do(component='prometheus', pattern='monitoring_server')
        term.notice('Finished scaling in.')

    def __delete_component(self, config=None, component=None, pattern=None, uuid=None):
        if component == 'pd':
            try:
                self._cluster.del_pd(uuid)
            except exceptions.TiOPSException as e:
                term.fatal('Unable to delete PD node from cluster: {}'.format(e))
                exit(1)
        if component == 'tikv':
            _tikv_info = ''
            for _tikv_node in config[pattern]:
                if _tikv_node['uuid'] != uuid:
                    continue
                if _tikv_node['offline']:
                    return
                _tikv_info = _tikv_node
            for ctikv in self._tikv_stores['stores']:
                # check if the node is in the cluster
                if '{}:{}'.format(_tikv_info['ip'],
                                  _tikv_info['port']) == ctikv['store']['address']:
                    _store_id = ctikv['store']['id']
                    # delete the store through the API
                    try:
                        self._cluster.del_store(_store_id)
                    except exceptions.TiOPSException as e:
                        term.fatal('Unable to delete store: {}'.format(e))
                        exit(1)
        if component == 'drainer':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_drainer(node_id=uuid)
        if component == 'pump':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_pump(node_id=uuid)
class OprUpgrade(OprDeploy):
    def __init__(self, args=None, topology=None):
        super(OprUpgrade, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)
        try:
            self.arg_ver = args.tidb_version
        except AttributeError:
            raise exceptions.TiOPSConfigError(
                '--tidb-version is not set for upgrade, abort.')
        try:
            self.force = args.force
        except AttributeError:
            self.force = False

    # check versions; this updates version related variables in memory,
    # but does not write them to disk
    def __check_version(self):
        new_ver = self.arg_ver.lstrip('v')
        curr_ver = self.topology.version.lstrip('v')
        _cmp = semver.compare(curr_ver, new_ver)
        if _cmp == 0:
            raise exceptions.TiOPSArgumentError(
                'Already running version {}.'.format(curr_ver))
        elif _cmp > 0:
            raise exceptions.TiOPSRuntimeError(
                'Downgrade is not supported, keep running {}.'.format(curr_ver),
                operation='upgrade')

        # update version and related variables
        self.old_ver = curr_ver
        self.new_ver = new_ver
        self.topology.version = 'v{}'.format(new_ver)
        self.topology.tiversion_dir = os.path.join(
            self.topology.tidown_dir, '{}'.format(self.topology.version))
        self.topology.resource_dir = utils.profile_path(
            'downloads', '{}/resources'.format(self.topology.version))
        self.topology.dashboard_dir = utils.profile_path(
            'downloads', '{}/dashboards'.format(self.topology.version))
        self.topology.package_dir = utils.profile_path(
            'downloads', '{}/packages'.format(self.topology.version))
        self.topology.config_dir = utils.profile_path(
            'downloads', '{}/configs'.format(self.topology.version))

    # check if the configuration of the TiDB components is reasonable
    def _check_config(self, topology=None):
        if not topology:
            topology = self.topology()
        _servers = [
            {'pd': 'pd_servers'},
            {'tikv': 'tikv_servers'},
            {'tidb': 'tidb_servers'},
        ]
        for _service in _servers:
            _component, _pattern = self.check_exist(_service, config=topology)
            if not _component and not _pattern:
                continue
            term.info('Check {} configuration.'.format(_component))
            self.act.configCheck(component=_component,
                                 pattern=_pattern,
                                 node=topology[_pattern][0]['uuid'])

    # TODO: check and merge configs
    def __check_config(self):
        pass

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        # check versions before processing
        self.__check_version()
        term.notice('Upgrading from v{} to v{}.'.format(
            self.old_ver, self.new_ver))

        # download packages for the new version
        term.info('Downloading TiDB related binary, it may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(version=self.new_ver, local_pkg=_local)

        # check configs
        self.__check_config()

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Upgrade specified node in cluster.')
        elif role:
            term.notice('Upgrade specified role in cluster.')
        else:
            term.notice('Upgrade TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        if self._args.enable_check_config:
            self._check_config()

        # for service in ['pd', 'tikv', 'pump', 'tidb']:
        #     grp = [x for x in self.topology.service_group if service in x.keys()]

        _cluster = modules.ClusterAPI(topology=self.topology)
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = ('Some PD nodes are unhealthy, maybe the server is stopped '
                       'or the network is unreachable, unhealthy node list: {}').format(
                           ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='upgrade')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        if self.force:
            for service in self.topology.service_group:
                component, pattern = self.check_exist(service=service,
                                                      config=_topology)
                if not component and not pattern:
                    continue
                if pattern in ['monitored_servers', 'monitoring_server',
                               'grafana_server', 'alertmanager_server']:
                    term.normal('Upgrade {}.'.format(component))
                    self.act.deploy_component(component=component, pattern=pattern)
                    self.act.stop_component(component=component, pattern=pattern)
                    self.act.start_component(component=component, pattern=pattern)
                    continue
                for _node in _topology[pattern]:
                    _uuid = _node['uuid']
                    term.normal('Upgrade {}, node id: {}.'.format(component, _uuid))
                    self.act.deploy_component(component=component, pattern=pattern, node=_uuid)
                    self.act.stop_component(component=component, pattern=pattern, node=_uuid)
                    self.act.start_component(component=component, pattern=pattern, node=_uuid)
            return

        # each iteration should only contain one item
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service=service,
                                                  config=_topology)
            if not component and not pattern:
                continue
            # upgrade pd servers, upgrading the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Upgrade {}, node id: {}.'.format(component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)
                    self.act.deploy_component(component=component, pattern=pattern, node=_uuid)
                    self.act.stop_component(component=component, pattern=pattern, node=_uuid)
                    self.act.start_component(component=component, pattern=pattern, node=_uuid)
                continue

            if pattern in ['monitored_servers', 'monitoring_server',
                           'grafana_server', 'alertmanager_server']:
                term.normal('Upgrade {}.'.format(component))
                self.act.deploy_component(component=component, pattern=pattern)
                self.act.stop_component(component=component, pattern=pattern)
                self.act.start_component(component=component, pattern=pattern)
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Upgrade {}, node id: {}.'.format(component, _uuid))
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)
                self.act.deploy_component(component=component, pattern=pattern, node=_uuid)
                self.act.stop_component(component=component, pattern=pattern, node=_uuid)
                self.act.start_component(component=component, pattern=pattern, node=_uuid)
                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)

    def _post(self, component=None, pattern=None, node=None, role=None):
        self.topology.set_meta(version=self.new_ver)
        term.notice('Upgraded to {}.'.format(self.topology.version))
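# NOTE: illustrative sketch only, not part of the original module. It shows the
# semver.compare() contract that __check_version() above relies on when it
# decides between "already running", "upgrade" and "refused downgrade"; the
# version strings are hypothetical examples.
def _semver_compare_sketch():
    import semver
    assert semver.compare('3.0.1', '3.0.2') == -1  # older -> newer: upgrade proceeds
    assert semver.compare('3.0.2', '3.0.2') == 0   # same version: TiOPSArgumentError
    assert semver.compare('3.0.3', '3.0.2') == 1   # newer -> older: downgrade refused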
class OprExec(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprExec, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)
        self._result = {
            'failed': {},
            'success': {},
            'unreachable': {},
        }

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        try:
            self.cmd = ' '.join(self._args.cmd)
        except AttributeError:
            raise exceptions.TiOPSArgumentError(
                'No command specified, do nothing.')
        term.notice('Run raw shell command on {} cluster.'.format(
            self.topology.cluster_name))
        term.normal('{}'.format(self.cmd))

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Running command on specified node in cluster.')
        elif role:
            term.notice('Running command on specified role in cluster.')
        else:
            term.notice('Running command on all nodes in cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        try:
            _sudo = self._args.root
        except AttributeError:
            _sudo = False

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.info('Running command on {}.'.format(component))
                self.__run(pattern=pattern, sudo=_sudo, cmd=self.cmd)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.info('Running command on {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.__run(pattern=pattern, node=','.join(_uuid),
                           sudo=_sudo, cmd=self.cmd)

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished running command on {} cluster.'.format(
            self.topology.cluster_name))
        print(term.bold_cyan('Success:'))
        for host, out in self._result['success'].items():
            _output = 'stdout: {}'.format(out['stdout'])
            if len(out['stderr']) > 0:
                _output += '\nstderr: {}'.format(out['stderr'])
            print(term.plain_green('{}:'.format(host)))
            print(term.plain(_output))
        if len(self._result['unreachable']) > 0:
            print(term.bold_yellow('Unreachable:'))
            for host, out in self._result['unreachable'].items():
                _output = 'stdout: {}'.format(out['stdout'])
                if len(out['stderr']) > 0:
                    _output += '\nstderr: {}'.format(out['stderr'])
                print(term.plain_yellow('{}:'.format(host)))
                print(term.plain(_output))
        if len(self._result['failed']) > 0:
            print(term.bold_red('Failed:'))
            for host, out in self._result['failed'].items():
                _output = 'stdout: {}'.format(out['stdout'])
                if len(out['stderr']) > 0:
                    _output += '\nstderr: {}'.format(out['stderr'])
                print(term.plain_red('{}:'.format(host)))
                print(term.plain(_output))

    def __run(self, pattern=None, node=None, sudo=False, cmd=None):
        try:
            _result = self.act.run_shell(pattern=pattern, node=node,
                                         sudo=sudo, cmd=cmd)
        except exceptions.TiOPSRuntimeError as e:
            term.error('Error executing command: {}'.format(e))
            _result = e.ctx
        for host, out in _result['success'].items():
            if host not in self._result['success']:
                self._result['success'][host] = out
        for host, out in _result['failed'].items():
            if host not in self._result['failed']:
                self._result['failed'][host] = out
        for host, out in _result['unreachable'].items():
            if host not in self._result['unreachable']:
                self._result['unreachable'][host] = out
def main(args=None):
    try:
        action = args.action
    except AttributeError:
        action = None  # no action specified; fall through without dispatching

    if action == 'version':
        print(term.plain(TiOPSVer()))
        exit(0)

    if action == 'quickdeploy':
        term.warn(
            'The quick deploy mode is for demo and testing, do NOT use in production!'
        )
        # do init
        _init = init.Init(args)
        try:
            _init.init(demo=True)
            _init.init_network(demo=True)
            _init.init_host(demo=True)
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)

        # do deploy
        topo = topology.Topology(args=args, merge=True)
        try:
            op.OprDeploy(args, topo, demo=True).do()
            op.OprStart(args, topo, demo=True).do()
            tm.TUIModule(topo, args=args).display()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSRequestError as e:
            msg = "{}, URL {} returned {}, please check the network and try again.".format(
                e.msg, e.url, e.code)
            term.error(msg)
            sys.exit(1)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-local':
        _init = init.Init(args)
        try:
            _init.init()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-ssh':
        _init = init.Init(args)
        try:
            _init.init_network()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-host':
        _init = init.Init(args)
        try:
            _init.init_host()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    else:
        try:
            if action not in ['deploy', 'display']:
                topo = topology.Topology(args)
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)

        if action == 'display':
            try:
                _cluster_name = args.cluster_name
            except AttributeError:
                _cluster_name = None
            try:
                if _cluster_name and len(_cluster_name) > 0:
                    topo = topology.Topology(args)
                    _list = False
                else:
                    topo = None
                    _list = True
                tm.TUIModule(topo, args=args).display(_list)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'deploy':
            topo = topology.Topology(args=args, merge=True)
            try:
                op.OprDeploy(args, topo).do()
                tm.TUIModule(topo, args=args).display()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSRequestError as e:
                msg = "{}, URL {} returned {}, please check the network and try again.".format(
                    e.msg, e.url, e.code)
                term.error(msg)
                sys.exit(1)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'start':
            try:
                op.OprStart(args, topo).do(node=args.node_id, role=args.role)
                tm.TUIModule(topo, args=args, status=True).display()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'stop':
            try:
                op.OprStop(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'restart':
            try:
                op.OprRestart(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'reload':
            try:
                op.OprReload(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'upgrade':
            try:
                op.OprUpgrade(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSRequestError as e:
                msg = "{}, URL {} returned {}, please check the network and try again.".format(
                    e.msg, e.url, e.code)
                term.error(msg)
                sys.exit(1)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'destroy':
            try:
                op.OprDestroy(args, topo).do()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'edit-config':
            try:
                Action(topo=topo).edit_file()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'scale-out':
            addTopo = utils.read_yaml(args.topology)
            try:
                op.OprScaleOut(args, topo, addTopo).do()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'scale-in':
            try:
                op.OprScaleIn(args, topo, args.node_id).do(node=args.node_id)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'exec':
            try:
                op.OprExec(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
class OprDeploy(OperationBase):
    def __init__(self, args=None, topology=None, demo=False):
        super(OprDeploy, self).__init__(args, topology, demo=demo)
        self.act = Action(ans=self.ans, topo=self.topology)
        self.demo = demo

    def _check_config(self):
        _servers = [
            {'pd': 'pd_servers'},
            {'tikv': 'tikv_servers'},
            {'tidb': 'tidb_servers'},
        ]
        for _service in _servers:
            _component, _pattern = self.check_exist(_service,
                                                    config=self.topology())
            if not _component and not _pattern:
                continue
            term.normal('Check {} configuration.'.format(_component))
            self.act.configCheck(component=_component,
                                 pattern=_pattern,
                                 node=self.topology()[_pattern][0]['uuid'])

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        if self.topology.version and self._args.tidb_version:
            new_ver = self._args.tidb_version.lstrip('v')
            curr_ver = self.topology.version.lstrip('v')
            _cmp = semver.compare(curr_ver, new_ver)
            if _cmp > 0:
                raise exceptions.TiOPSArgumentError(
                    'Running version is {}, can\'t downgrade.'.format(curr_ver))
        term.notice('Begin installing TiDB cluster.')

        # download packages
        term.info('Downloading TiDB related binary, it may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(local_pkg=_local)

        if not self.demo:
            # edit config
            self.act.edit_file()

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        if self._args.enable_check_config:
            self._check_config()

    def _process(self, component=None, pattern=None, node=None, role=None):
        # create directories
        term.info('Create directory in all nodes.')
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            self.act.create_directory(component=component, pattern=pattern)
        if not self.demo:
            self.act.check_machine_config()

        # run the deployment
        if self.demo:
            term.warn(
                'FirewallD is being disabled on deployment machines in quick deploy mode.'
            )
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            term.normal('Deploy {}.'.format(component))
            self.act.deploy_component(component=component, pattern=pattern)
            self.act.deploy_firewall(component=component, pattern=pattern)
        if not self.demo:
            self.act.deploy_tool()

    def _post(self, component=None, pattern=None, node=None, role=None):
        self.topology.set_meta()
        self.topology._save_topology()
        if self.demo:
            term.notice('Finished deploying TiDB cluster {} ({}).'.format(
                self.topology.cluster_name, self.topology.version))
        else:
            term.notice(
                'Finished deploying TiDB cluster {} ({}), don\'t forget to start it.'
                .format(self.topology.cluster_name, self.topology.version))
def check_tombstone(self, topology=None, args=None):
    if not topology:
        topology = self.topology
    if not args:
        args = self._args

    _remove_uuid = []

    _cluster = ClusterAPI(topology)
    _binlog = BinlogAPI(topology)

    if _cluster.tikv_stores() and _cluster.tikv_tombstone():
        # find tombstone tikv nodes
        for _node in topology()['tikv_servers']:
            _tombstone = False
            if not _node['offline']:
                continue
            # addresses of online tikv nodes
            _online_list = [
                x['store']['address']
                for x in _cluster.tikv_stores()['stores']
            ]
            # addresses of tombstone tikv nodes
            _tombstone_list = [
                x['store']['address']
                for x in _cluster.tikv_tombstone()['stores']
            ]

            _address = '{}:{}'.format(_node['ip'], _node['port'])

            # if the node is still online, skip it
            if _address in _online_list:
                continue
            # if the node is tombstone, delete it from the topology
            elif _address in _tombstone_list:
                _remove_uuid.append(_node['uuid'])

    if _binlog.pump_status:
        # find tombstone pump nodes
        for _node in topology()['pump_servers']:
            _tombstone = False
            if not _node['offline']:
                continue
            _online_list = [
                x['nodeId']
                for x in _binlog.pump_status['status'].itervalues()
                if x['state'] != 'offline'
            ]
            _tombstone_list = [
                x['nodeId']
                for x in _binlog.pump_status['status'].itervalues()
                if x['state'] == 'offline'
            ]

            if _node['uuid'] in _online_list:
                continue
            elif _node['uuid'] in _tombstone_list:
                _remove_uuid.append(_node['uuid'])

        # find tombstone drainer nodes
        for _node in topology()['drainer_servers']:
            _tombstone = False
            if not _node['offline']:
                continue
            _online_list = [
                x['nodeId'] for x in _binlog.drainer_status
                if x['state'] != 'offline'
            ]
            _tombstone_list = [
                x['nodeId'] for x in _binlog.drainer_status
                if x['state'] == 'offline'
            ]

            if _node['uuid'] in _online_list:
                continue
            elif _node['uuid'] in _tombstone_list:
                _remove_uuid.append(_node['uuid'])

    if not _remove_uuid:
        return

    _new_topo, _diff = topology.remove(','.join(_remove_uuid), delete=True)

    ans = ansibleapi.ANSRunner(user=topology.user,
                               topology=_diff,
                               tiargs=args)
    act = Action(ans=ans, topo=topology)
    for service in [{'drainer': 'drainer_servers'},
                    {'pump': 'pump_servers'},
                    {'tikv': 'tikv_servers'}]:
        component, pattern = self.check_exist(service, _diff)
        if not component and not pattern:
            continue
        act.stop_component(component=component, pattern=pattern)
        act.destroy_component(component=component, pattern=pattern)

    topology.replace(_new_topo)
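# NOTE: illustrative sketch only, not called by tiops. It isolates the address
# matching used in check_tombstone() above to decide whether an offline TiKV
# node has become tombstone; the sample node and store lists are hypothetical.
def _is_tikv_tombstone_sketch():
    node = {'ip': '10.0.1.1', 'port': 20160, 'offline': True, 'uuid': 'tikv_1'}
    online_addrs = ['10.0.1.2:20160', '10.0.1.3:20160']  # from tikv_stores()
    tombstone_addrs = ['10.0.1.1:20160']                 # from tikv_tombstone()
    address = '{}:{}'.format(node['ip'], node['port'])
    # only nodes already marked offline and reported as tombstone by PD
    # are removed from the topology
    return (node['offline'] and address not in online_addrs
            and address in tombstone_addrs)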
class OprReload(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprReload, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Reload specified node in cluster.')
        elif role:
            term.notice('Reload specified role in cluster.')
        else:
            term.notice('Reload TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        _cluster = modules.ClusterAPI(topology=self.topology)
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = ('Some PD nodes are unhealthy, maybe the server is stopped '
                       'or the network is unreachable, unhealthy node list: {}').format(
                           ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='reload')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        # each iteration should only contain one item
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service=service,
                                                  config=_topology)
            if not component and not pattern:
                continue
            # reload pd servers, reloading the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Reload {}, node id: {}.'.format(component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=_uuid)
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=_uuid)
                continue

            if pattern in ['monitored_servers', 'monitoring_server',
                           'grafana_server', 'alertmanager_server']:
                if not node:
                    term.normal('Reload {}.'.format(component))
                    self.act.deploy_component(component=component, pattern=pattern)
                    self.act.stop_component(component=component, pattern=pattern)
                    self.act.start_component(component=component, pattern=pattern)
                else:
                    _uuid = [x['uuid'] for x in _topology[pattern]]
                    term.normal('Reload {}, node list: {}.'.format(
                        component, ','.join(_uuid)))
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=','.join(_uuid))
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=','.join(_uuid))
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=','.join(_uuid))
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(component, _uuid))
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)
                self.act.deploy_component(component=component, pattern=pattern, node=_uuid)
                self.act.stop_component(component=component, pattern=pattern, node=_uuid)
                self.act.start_component(component=component, pattern=pattern, node=_uuid)
                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished reloading config for {} cluster.'.format(
            self.topology.version))
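# NOTE: illustrative sketch only, not called by tiops. It shows the ordering
# the reload and upgrade loops above apply to PD nodes: followers first, the
# current leader last (after evicting its leadership). The node list and
# leader uuid below are hypothetical.
def _pd_leader_last_sketch():
    pd_nodes = [{'uuid': 'pd_1'}, {'uuid': 'pd_2'}, {'uuid': 'pd_3'}]
    leader_uuid = 'pd_2'
    followers = [n for n in pd_nodes if n['uuid'] != leader_uuid]
    leader = [n for n in pd_nodes if n['uuid'] == leader_uuid]
    ordered = followers + leader
    return [n['uuid'] for n in ordered]  # ['pd_1', 'pd_3', 'pd_2']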
def __init__(self, args=None, topology=None):
    super(OprStop, self).__init__(args, topology)
    self.act = Action(ans=self.ans, topo=self.topology)
def __init__(self, args=None, topology=None, demo=False):
    super(OprStart, self).__init__(args, topology, demo=demo)
    self.act = Action(ans=self.ans, topo=self.topology)
    self.demo = demo
class OprDestroy(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprDestroy, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.warn('The TiDB cluster {} ({}) is going to be destroyed.'.format(
            self.topology.cluster_name, self.topology.version))
        rm_prompt = 'This operation will ' + term.warn_red('remove') \
            + ' the TiDB cluster ' + term.highlight_red(self.topology.cluster_name) \
            + '. It can NOT be undone. ' + term.yes_no() + ':'
        notice = term.input(rm_prompt)
        if notice.lower() not in ['y', 'yes']:
            term.notice('Terminate the destroy operation.')
            raise exceptions.TiOPSRuntimeError('Operation cancelled by user.')

    def _process(self, component=None, pattern=None, node=None, role=None):
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        term.info('Stopping TiDB cluster.')
        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            try:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))
                pass

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            term.normal('{} is being destroyed.'.format(component))
            try:
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))
                pass

        # remove deploy and data directories
        self.ans.run_model('shell',
                           'rm -rf {{ full_deploy_dir | cluster_dir }}',
                           become=True,
                           group='*')
        self.ans.run_model('shell',
                           'rm -rf {{ full_data_dir | cluster_dir }}',
                           become=True,
                           group='*')

    def _post(self, component=None, pattern=None, node=None, role=None):
        try:
            utils.remove_dir(utils.profile_path(self.topology.cluster_dir))
        except Exception as e:
            logging.warning(e)
        term.notice('TiDB cluster destroyed.')