def _post(self, component=None, pattern=None, node=None, role=None):
    try:
        utils.remove_dir(utils.profile_path(self.topology.cluster_dir))
    except Exception as e:
        logging.warning(e)
    term.notice('TiDB cluster destroyed.')

def _post(self, component=None, pattern=None, node=None, role=None):
    term.notice('Finished reload config for {} cluster.'.format(
        self.topology.version))
    print(term.bold_cyan('Success:'))
    for host, out in self._result['success'].items():
        _output = 'stdout: {}'.format(out['stdout'])
        if len(out['stderr']) > 0:
            _output += '\nstderr: {}'.format(out['stderr'])
        print(term.plain_green('{}:'.format(host)))
        print(term.plain(_output))
    if len(self._result['unreachable']) > 0:
        print(term.bold_yellow('Unreachable:'))
        for host, out in self._result['unreachable'].items():
            _output = 'stdout: {}'.format(out['stdout'])
            if len(out['stderr']) > 0:
                _output += '\nstderr: {}'.format(out['stderr'])
            print(term.plain_yellow('{}:'.format(host)))
            print(term.plain(_output))
    if len(self._result['failed']) > 0:
        print(term.bold_red('Failed:'))
        for host, out in self._result['failed'].items():
            _output = 'stdout: {}'.format(out['stdout'])
            if len(out['stderr']) > 0:
                _output += '\nstderr: {}'.format(out['stderr'])
            print(term.plain_red('{}:'.format(host)))
            print(term.plain(_output))

def _post(self, component=None, pattern=None, node=None, role=None):
    # if 'pd_servers' in self._diff:
    #     reload_pd = True
    # else:
    #     reload_pd = False
    self.topology.replace(self._new_topo)
    term.info('Update configuration.')
    ans = ansibleapi.ANSRunner(user=self.topology.user,
                               topology=self.topology._topology(
                                   self._new_topo),
                               tiargs=self._args)
    act = Action(ans=ans, topo=self.topology)
    if 'pd_servers' in self._diff:
        act.deploy_component(component='pd', pattern='pd_servers')
        act.deploy_component(component='tikv', pattern='tikv_servers')
        act.deploy_component(component='tidb', pattern='tidb_servers')
        act.deploy_component(component='pump', pattern='pump_servers')
        act.deploy_component(component='drainer', pattern='drainer_servers')
    act.deploy_component(component='prometheus', pattern='monitoring_server')
    act.stop_component(component='prometheus', pattern='monitoring_server')
    act.start_component(component='prometheus', pattern='monitoring_server')
    term.notice('Finished scaling out.')

def _prepare(self, component=None, pattern=None, node=None, role=None):
    if self.topology.version and self._args.tidb_version:
        new_ver = self._args.tidb_version.lstrip('v')
        curr_ver = self.topology.version.lstrip('v')
        _cmp = semver.compare(curr_ver, new_ver)
        if _cmp > 0:
            raise exceptions.TiOPSArgumentError(
                'Running version is {}, can\'t downgrade.'.format(curr_ver))
    term.notice('Begin installing TiDB cluster.')
    # download packages
    term.info('Downloading TiDB related binary, it may take a few minutes.')
    try:
        _local = self._args.local_pkg
    except AttributeError:
        _local = None
    self.act.download(local_pkg=_local)
    if not self.demo:
        # edit config
        self.act.edit_file()
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
    if self._args.enable_check_config:
        self._check_config()

def _prepare(self, component=None, pattern=None, node=None, role=None):
    try:
        self.cmd = ' '.join(self._args.cmd)
    except AttributeError:
        raise exceptions.TiOPSArgumentError(
            'No command specified, nothing to do.')
    term.notice('Run raw shell command on {} cluster.'.format(
        self.topology.cluster_name))
    term.normal('{}'.format(self.cmd))

def _post(self, component=None, pattern=None, node=None, role=None):
    self.topology.set_meta()
    self.topology._save_topology()
    if self.demo:
        term.notice('Finished deploying TiDB cluster {} ({}).'.format(
            self.topology.cluster_name, self.topology.version))
    else:
        term.notice(
            'Finished deploying TiDB cluster {} ({}), don\'t forget to start it.'
            .format(self.topology.cluster_name, self.topology.version))

def _prepare(self, component=None, pattern=None, node=None, role=None):
    term.warn('The TiDB cluster {} ({}) is going to be destroyed.'.format(
        self.topology.cluster_name, self.topology.version))
    rm_prompt = 'This operation will ' + term.warn_red('remove') \
        + ' the TiDB cluster ' + term.highlight_red(self.topology.cluster_name) \
        + '. It can NOT be undone. ' + term.yes_no() + ':'
    notice = term.input(rm_prompt)
    if notice.lower() not in ['y', 'yes']:
        term.notice('Terminating the destroy operation.')
        raise exceptions.TiOPSRuntimeError('Operation cancelled by user.')

def _post(self, component=None, pattern=None, node=None, role=None):
    ans = ansibleapi.ANSRunner(user=self.topology.user,
                               topology=self.topology(),
                               tiargs=self._args)
    act = Action(ans=ans, topo=self.topology)
    if 'pd_servers' in self._diff:
        act.deploy_component(component='pd', pattern='pd_servers')
        act.deploy_component(component='tikv', pattern='tikv_servers')
        act.deploy_component(component='tidb', pattern='tidb_servers')
        act.deploy_component(component='pump', pattern='pump_servers')
        act.deploy_component(component='drainer', pattern='drainer_servers')
    # self.deploy.deploy_component(component='prometheus', pattern='monitoring_server', ans=ans)
    # self.reload.do(component='prometheus', pattern='monitoring_server')
    term.notice('Finished scaling in.')

def _prepare(self, component=None, pattern=None, node=None, role=None):
    # check versions before processing
    self.__check_version()
    term.notice('Upgrading from v{} to v{}.'.format(self.old_ver, self.new_ver))
    # download packages for new version
    term.info('Downloading TiDB related binary, it may take a few minutes.')
    try:
        _local = self._args.local_pkg
    except AttributeError:
        _local = None
    self.act.download(version=self.new_ver, local_pkg=_local)
    # check configs
    self.__check_config()

def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Start specified node in cluster.')
    elif role:
        term.notice('Start specified role in cluster.')
    else:
        term.notice('Start TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)
    if not self.demo:
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=_topology)
        if not component and not pattern:
            continue
        if not node:
            term.normal('Starting {}.'.format(component))
            self.act.start_component(component, pattern)
        else:
            _uuid = [x['uuid'] for x in _topology[pattern]]
            term.normal('Starting {}, node list: {}.'.format(
                component, ','.join(_uuid)))
            self.act.start_component(component, pattern, ','.join(_uuid))

def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Running command on specified node in cluster.')
    elif role:
        term.notice('Running command on specified role in cluster.')
    else:
        term.notice('Running command on all nodes in cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)
    try:
        _sudo = self._args.root
    except AttributeError:
        _sudo = False
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=_topology)
        if not component and not pattern:
            continue
        if not node:
            term.info('Running command on {}.'.format(component))
            self.__run(pattern=pattern, sudo=_sudo, cmd=self.cmd)
        else:
            _uuid = [x['uuid'] for x in _topology[pattern]]
            term.info('Running command on {}, node list: {}.'.format(
                component, ','.join(_uuid)))
            self.__run(pattern=pattern,
                       node=','.join(_uuid),
                       sudo=_sudo,
                       cmd=self.cmd)

def _prepare(self, component=None, pattern=None, node=None, role=None):
    if not self._diff:
        msg = 'No new nodes to scale out.'
        term.error(msg)
        raise exceptions.TiOPSConfigError(msg)
    term.notice('Begin adding nodes to TiDB cluster.')
    # copy template
    utils.create_dir(self.topology.cache_template_dir)
    utils.copy_template(source=os.path.join(self.topology.titemplate_dir),
                        target=os.path.join(self.topology.cache_template_dir))
    # update run scripts when scaling out
    for service in ['pd', 'tikv', 'tidb', 'pump', 'drainer']:
        if '{}_servers'.format(service) in self._diff:
            template_path = os.path.join(
                self.topology.cache_template_dir,
                'scripts/run_{}.sh.j2'.format(service))
            _original, new_template = utils.script_template(
                path=self.topology.cluster_dir,
                template=template_path,
                service=service)
            utils.write_template(template_path, new_template)

def init_network(self, demo=False):
    """Set up passwordless SSH from the management machine to all hosts:
    create the deploy user, install its public key and grant sudo."""
    term.notice(
        'Start creating passwordless SSH connections between the management machine and other machines.'
    )
    self._check_ip_list()
    # get password from prompt
    if not self._args.password:
        term.info(
            'Please enter the password of the init user on the deployment server ({} password).'
            .format(self.init_user))
        _passwd = term.getpass()
    else:
        _passwd = self._args.password
    # create ansible runner
    initnet = ansibleapi.ANSRunner(ips=self.hosts,
                                   user=self.init_user,
                                   tiargs=self._args,
                                   passwd=_passwd)
    term.info('Create {} user on remote machines.'.format(self.user))
    initnet.run_model('user',
                      'name=%s '
                      'shell=/bin/bash '
                      'createhome=yes' % (self.user),
                      become=True)
    term.info('Set authorized_keys for {} on cluster machines.'.format(
        self.user))
    initnet.run_model('authorized_key',
                      'user=%s '
                      'key={{ lookup("file", "%s/id_rsa.pub") }}' %
                      (self.user, utils.profile_path('.ssh')),
                      become=True)
    term.info('Add sudo permissions for {} on cluster machines.'.format(
        self.user))
    initnet.run_model('lineinfile',
                      'path=/etc/sudoers '
                      'line="{} ALL=(ALL) NOPASSWD: ALL" '
                      'regexp="^{} .*" '
                      'insertafter=EOF '
                      'state=present'.format(self.user, self.user),
                      become=True)
    if demo:
        term.notice('Finished setting up SSH keys.')
    else:
        term.notice('Done!!!')

def init(self, demo=False):
    """Prepare the management machine: create the key directory under the
    tiops profile path and generate an RSA key pair if one is missing."""
    term.notice('Start initializing the management machine.')
    key_home = utils.profile_path('.ssh')
    if not os.path.exists(key_home):
        utils.create_dir(key_home)
        os.chmod(os.path.join(key_home), 0o700)
    if not os.path.isfile(os.path.join(key_home, 'id_rsa')) or \
            not os.path.isfile(os.path.join(key_home, 'id_rsa.pub')):
        term.info('There is no SSH key for {}. Start generating.'.format(
            getpass.getuser()))
        os.system(
            '/usr/bin/ssh-keygen -t rsa -N \'\' -f {}/id_rsa -q'.format(
                key_home))
    else:
        term.normal('SSH key for {} already exists, skip creating.'.format(
            getpass.getuser()))
    if demo:
        term.notice('Finished initializing the management machine.')
    else:
        term.notice('Done!!!')

def _post(self, component=None, pattern=None, node=None, role=None):
    term.notice('Finished start.')

def _post(self, component=None, pattern=None, node=None, role=None):
    self.topology.set_meta(version=self.new_ver)
    term.notice('Upgraded to {}.'.format(self.topology.version))

def _prepare(self, component=None, pattern=None, node=None, role=None):
    term.notice('Begin deleting nodes from TiDB cluster.')
    self._cluster = modules.ClusterAPI(topology=self.topology)
    self._pd_status = self._cluster.status()
    self._tikv_stores = self._cluster.tikv_stores()

if __name__ == '__main__':
    _parser = cmd.TiOPSParser()
    args = _parser()
    # add logging facilities, but outputs are not modified to use it yet
    if args.verbose:
        logging.basicConfig(
            filename=utils.profile_path("tiops.log"),
            format='[%(asctime)s] [%(levelname)s] %(message)s (at %(filename)s:%(lineno)d in %(funcName)s).',
            datefmt='%Y-%m-%d %T %z',
            level=logging.DEBUG)
        logging.info("Using logging level: DEBUG.")
        logging.debug("Debug logging enabled.")
        logging.debug("Input arguments are: %s" % args)
    else:
        logging.basicConfig(
            filename=utils.profile_path("tiops.log"),
            format='[%(asctime)s] [%(levelname)s] %(message)s.',
            datefmt='%Y-%m-%d %T %z',
            level=logging.INFO)
        logging.info("Using logging level: INFO.")
    try:
        main(args)
    except KeyboardInterrupt:
        # not auto cleaning up because the operation may be undefined
        term.notice(
            'Process interrupted, make sure to check if configs are correct.')

def init_host(self, demo=False):
    """Initialize cluster machines: gather facts, run optional environment
    checks, configure timezone and NTP, and tune kernel and system settings."""
    term.notice('Begin initializing the cluster machines.')
    self._check_ip_list()
    inithost = ansibleapi.ANSRunner(ips=self.hosts,
                                    user=self.user,
                                    tiargs=self._args)
    inithost.run_model('file',
                       'path=%s '
                       'state=directory '
                       'owner=%s '
                       'group=%s' % (self.check_dir, self.user, self.user),
                       become=True)
    if not os.path.exists(self.host_vars):
        os.mkdir(self.host_vars)
    elif not os.path.isdir(self.host_vars):
        os.remove(self.host_vars)
        os.mkdir(self.host_vars)
    # get host environment vars
    term.info('Get all host environment vars.')
    setup = inithost.run_model('setup',
                               'gather_subset="all" '
                               'gather_timeout=120')
    # record host environment vars
    for _host, _vars in setup['success'].iteritems():
        __vars = json.loads(json.dumps(_vars))
        utils.write_yaml(
            os.path.join(self.host_vars, '{}.yaml'.format(_host)), __vars)
    if self.enable_checks:
        term.info('Check OS platform.')
        self.check_os_platform(setup)
        term.info('Check OS version.')
        self.check_os_version(setup)
        # check if the CPU supports EPOLLEXCLUSIVE
        term.info('Check if CPU supports EPOLLEXCLUSIVE.')
        inithost.run_model('copy',
                           'src=%s/{{ item }} '
                           'dest=%s '
                           'mode=0755' % (self.script_path, self.check_dir),
                           with_items=['epollexclusive', 'run_epoll.sh'])
        check_epoll = inithost.run_model(
            'shell', '%s/run_epoll.sh' % (self.check_dir))
        _unsupport_epoll = [
            host for host, info in check_epoll['success'].iteritems()
            if 'True' not in info['stdout']
        ]
        if _unsupport_epoll:
            raise exceptions.TiOPSRuntimeError(
                'CPU does not support EPOLLEXCLUSIVE on {} machine(s), please change the system version or upgrade the kernel version.'
                .format(','.join(_unsupport_epoll).replace('_', '.')),
                operation='init')
        # check systemd service version (systemd package, not operating system)
        _systemd = inithost.run_model('yum', 'list=systemd', become=True)
        term.info('Check systemd service version.')
        self.check_systemd_version(_systemd)
    if not demo:
        # check and set cpu mode to performance
        term.info('Set CPU performance mode if supported.')
        # enable cpupower service
        inithost.run_model('systemd',
                           'name="cpupower"',
                           'state=started',
                           'enabled=yes')
        # set timezone
        if self.timezone:
            inithost.run_model(
                'blockinfile', 'path=/home/%s/.bash_profile '
                'insertbefore="# End of file" '
                'block="export TZ=%s"' % (self.user, self.timezone))
        # install ntp package
        if self.ntp_server:
            term.info('Install ntpd package.')
            inithost.run_model('yum',
                               'name="ntp" '
                               'state=present',
                               become=True)
            term.info('Add ntp servers to ntpd config and reload.')
            _ntp_config = [
                'server {} iburst'.format(server)
                for server in self.ntp_server.split(',')
            ]
            inithost.run_model('blockinfile',
                               'path="/etc/ntp.conf" '
                               'insertbefore="# End of file" '
                               'marker="#{mark} TiDB %s MANAGED BLOCK" '
                               'block="%s"' % (self.user, '\n'.join(_ntp_config)),
                               become=True)
            inithost.run_model('systemd',
                               'name=ntpd '
                               'state=restarted '
                               'enabled=yes',
                               become=True)
        # check if time is synchronized
        if self.enable_check_ntp:
            term.info('Check if NTP is running to synchronize the time.')
            result = inithost.run_model(
                'shell', 'ntpstat | grep -w synchronised | wc -l')
            _unrun = [
                host for host, info in result['success'].iteritems()
                if info['stderr']
            ]
            if _unrun:
                raise exceptions.TiOPSRuntimeError(
                    'NTP server may be stopped on {} machine(s).'.format(
                        ','.join(_unrun).replace('_', '.')),
                    operation='init')
            _unsync = [
                host for host, info in result['success'].iteritems()
                if info['stdout'] == str(0)
            ]
            if _unsync:
                raise exceptions.TiOPSRuntimeError(
                    'Time is unsynchronised on {}, please check the NTP status.'
                    .format(','.join(_unsync).replace('_', '.')),
                    operation='init')
    term.info('Update kernel parameters on cluster machines.')
    inithost.run_model('sysctl',
                       'name={{ item.name }} value={{ item.value }}',
                       become=True,
                       with_items=[{
                           'name': 'net.ipv4.tcp_tw_recycle',
                           'value': 0
                       }, {
                           'name': 'net.core.somaxconn',
                           'value': 32768
                       }, {
                           'name': 'vm.swappiness',
                           'value': 0
                       }, {
                           'name': 'net.ipv4.tcp_syncookies',
                           'value': 0
                       }, {
                           'name': 'fs.file-max',
                           'value': 1000000
                       }])
    inithost.run_model(
        'blockinfile', 'path="/etc/security/limits.conf" '
        'insertbefore="# End of file" '
        'marker="#{mark} TiDB %s MANAGED BLOCK" '
        'block="%s soft nofile 1000000\n'
        '%s hard nofile 1000000\n'
        '%s soft stack 10240\n"' %
        (self.user, self.user, self.user, self.user),
        become=True)
    if not self.enable_swap:
        term.info('Turn off swap on remote machines.')
        inithost.run_model('shell', 'swapoff -a', become=True)
    else:
        term.info('Turn on swap on remote machines.')
        inithost.run_model('shell', 'swapon -a', become=True)
    # disable selinux
    term.info('Disable selinux.')
    inithost.run_model('selinux', 'state=disabled', become=True)
    # set and start irqbalance
    if not demo and not self.disable_irqbalance:
        term.info('Set and start irqbalance.')
        inithost.run_model('lineinfile',
                           'dest=/etc/sysconfig/irqbalance '
                           'regexp="(?<!_)ONESHOT=" '
                           'line="ONESHOT=yes"',
                           become=True)
        inithost.run_model('systemd',
                           'name=irqbalance.service '
                           'state=restarted '
                           'enabled=yes',
                           become=True)
    if demo:
        term.notice('Finished initializing the deployment machine.')
    else:
        term.notice('Done!!!')

def _process(self, component=None, pattern=None, node=None, role=None):
    """Reload components one by one: redeploy the config then restart each
    node, evicting the PD leader / TiKV region leaders first where needed."""
    if node:
        term.notice('Reload specified node in cluster.')
    elif role:
        term.notice('Reload specified role in cluster.')
    else:
        term.notice('Reload TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)
    _cluster = modules.ClusterAPI(topology=self.topology)
    _unhealth_node = []
    for _pd_node in _cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
            msg = 'Some PD nodes are unhealthy, maybe the server is stopped or the network is unreachable, unhealthy node list: {}'.format(
                ','.join(_unhealth_node))
            term.fatal(msg)
            raise exceptions.TiOPSRuntimeError(msg, operation='reload')
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    # every service group should only contain one item
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service=service,
                                              config=_topology)
        if not component and not pattern:
            continue
        # reload pd servers, reload the leader node last
        if component == 'pd':
            _pd_list = []
            for _node in _topology[pattern]:
                if _node['uuid'] == _cluster.pd_leader():
                    _leader = _node
                else:
                    _pd_list.append(_node)
            _pd_list.append(_leader)
            for _node in _pd_list:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(component, _uuid))
                if _uuid == _cluster.pd_leader():
                    _cluster.evict_pd_leader(uuid=_uuid)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)
            continue
        if pattern in [
                'monitored_servers', 'monitoring_server', 'grafana_server',
                'alertmanager_server'
        ]:
            if not node:
                term.normal('Reload {}.'.format(component))
                self.act.deploy_component(component=component, pattern=pattern)
                self.act.stop_component(component=component, pattern=pattern)
                self.act.start_component(component=component, pattern=pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Reload {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=','.join(_uuid))
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=','.join(_uuid))
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=','.join(_uuid))
            continue
        for _node in _topology[pattern]:
            _uuid = _node['uuid']
            _host = _node['ip']
            term.normal('Reload {}, node id: {}.'.format(component, _uuid))
            if pattern == 'tikv_servers':
                _port = _node['port']
                _cluster.evict_store_leaders(host=_host, port=_port)
            self.act.deploy_component(component=component,
                                      pattern=pattern,
                                      node=_uuid)
            self.act.stop_component(component=component,
                                    pattern=pattern,
                                    node=_uuid)
            self.act.start_component(component=component,
                                     pattern=pattern,
                                     node=_uuid)
            if pattern == 'tikv_servers':
                _cluster.remove_evict(host=_host, port=_port)

def _post(self, component=None, pattern=None, node=None, role=None):
    term.notice('Finished reload config for {} cluster.'.format(
        self.topology.version))