def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Start specified node in cluster.')
    elif role:
        term.notice('Start specified role in cluster.')
    else:
        term.notice('Start TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)

    if not self.demo:
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=_topology)
        if not component and not pattern:
            continue
        if not node:
            term.normal('Starting {}.'.format(component))
            self.act.start_component(component, pattern)
        else:
            _uuid = [x['uuid'] for x in _topology[pattern]]
            term.normal('Starting {}, node list: {}.'.format(
                component, ','.join(_uuid)))
            self.act.start_component(component, pattern, ','.join(_uuid))
def _prepare(self, component=None, pattern=None, node=None, role=None):
    if self.topology.version and self._args.tidb_version:
        new_ver = self._args.tidb_version.lstrip('v')
        curr_ver = self.topology.version.lstrip('v')
        _cmp = semver.compare(curr_ver, new_ver)
        if _cmp > 0:
            raise exceptions.TiOPSArgumentError(
                'Running version is {}, can\'t downgrade.'.format(curr_ver))

    term.notice('Begin installing TiDB cluster.')

    # download packages
    term.info('Downloading TiDB related binary, it may take a few minutes.')
    try:
        _local = self._args.local_pkg
    except AttributeError:
        _local = None
    self.act.download(local_pkg=_local)

    if not self.demo:
        # edit config
        self.act.edit_file()
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        if self._args.enable_check_config:
            self._check_config()
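

# A minimal, standalone sketch of the downgrade guard used in _prepare() above.
# The helper name and the version strings in the usage note are hypothetical,
# not part of TiOps; only semver.compare() is taken from the real code.
def _sketch_refuse_downgrade(curr_ver, new_ver):
    import semver
    # semver.compare() returns -1, 0 or 1; a positive result means the
    # running version is newer than the requested one, i.e. a downgrade.
    if semver.compare(curr_ver.lstrip('v'), new_ver.lstrip('v')) > 0:
        raise ValueError('Running version is {}, can\'t downgrade.'.format(curr_ver))

# Example: _sketch_refuse_downgrade('v3.0.5', 'v3.0.1') raises ValueError,
# while _sketch_refuse_downgrade('v3.0.1', 'v3.0.5') returns silently.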
def _post(self, component=None, pattern=None, node=None, role=None):
    # if 'pd_servers' in self._diff:
    #     reload_pd = True
    # else:
    #     reload_pd = False
    self.topology.replace(self._new_topo)

    term.info('Update configuration.')
    ans = ansibleapi.ANSRunner(user=self.topology.user,
                               topology=self.topology._topology(self._new_topo),
                               tiargs=self._args)
    act = Action(ans=ans, topo=self.topology)
    if 'pd_servers' in self._diff:
        act.deploy_component(component='pd', pattern='pd_servers')
        act.deploy_component(component='tikv', pattern='tikv_servers')
        act.deploy_component(component='tidb', pattern='tidb_servers')
        act.deploy_component(component='pump', pattern='pump_servers')
        act.deploy_component(component='drainer', pattern='drainer_servers')
    act.deploy_component(component='prometheus', pattern='monitoring_server')
    act.stop_component(component='prometheus', pattern='monitoring_server')
    act.start_component(component='prometheus', pattern='monitoring_server')
    term.notice('Finished scaling out.')
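

# When the scale-out diff touches pd_servers, _post() above pushes fresh
# configuration to every component whose config references the PD member list,
# then redeploys and restarts Prometheus so it scrapes the new nodes. A hedged
# sketch of that decision (the helper name is hypothetical):
def _sketch_components_to_redeploy(diff):
    # Components whose generated config embeds the PD endpoints.
    pd_dependent = [('pd', 'pd_servers'), ('tikv', 'tikv_servers'),
                    ('tidb', 'tidb_servers'), ('pump', 'pump_servers'),
                    ('drainer', 'drainer_servers')]
    # Prometheus is always redeployed to pick up new scrape targets.
    always = [('prometheus', 'monitoring_server')]
    return (pd_dependent if 'pd_servers' in diff else []) + always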
def _process(self, component=None, pattern=None, node=None, role=None):
    # create directory
    term.info('Create directory in all nodes.')
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=self.topology())
        if not component and not pattern:
            continue
        self.act.create_directory(component=component, pattern=pattern)

    if not self.demo:
        self.act.check_machine_config()

    # start run deploy
    if self.demo:
        term.warn(
            'FirewallD is being disabled on deployment machines in quick deploy mode.')
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=self.topology())
        if not component and not pattern:
            continue
        term.normal('Deploy {}.'.format(component))
        self.act.deploy_component(component=component, pattern=pattern)
        self.act.deploy_firewall(component=component, pattern=pattern)

    if not self.demo:
        self.act.deploy_tool()
def _check_config(self, topology=None):
    if not topology:
        topology = self.topology()
    _servers = [
        {'pd': 'pd_servers'},
        {'tikv': 'tikv_servers'},
        {'tidb': 'tidb_servers'},
    ]
    for _service in _servers:
        _component, _pattern = self.check_exist(_service, config=topology)
        if not _component and not _pattern:
            continue
        term.info('Check {} configuration.'.format(_component))
        self.act.configCheck(component=_component,
                             pattern=_pattern,
                             node=topology[_pattern][0]['uuid'])
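

# check_exist() is used throughout these methods to map a one-entry service
# dict such as {'pd': 'pd_servers'} onto a (component, pattern) pair, or a
# falsy pair when the topology has no such group. A rough sketch of that
# behaviour under those assumptions (the real implementation may differ):
def _sketch_check_exist(service, config):
    component, pattern = list(service.items())[0]
    if not config.get(pattern):
        return None, None
    return component, pattern

# Example: _sketch_check_exist({'tidb': 'tidb_servers'}, {'tidb_servers': []})
# returns (None, None), so callers skip the component entirely.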
def _prepare(self, component=None, pattern=None, node=None, role=None):
    # check versions before processing
    self.__check_version()
    term.notice('Upgrading from v{} to v{}.'.format(self.old_ver, self.new_ver))

    # download packages for new version
    term.info('Downloading TiDB related binary, it may take a few minutes.')
    try:
        _local = self._args.local_pkg
    except AttributeError:
        _local = None
    self.act.download(version=self.new_ver, local_pkg=_local)

    # check configs
    self.__check_config()
def _process(self, component=None, pattern=None, node=None, role=None):
    _unhealth_node = []
    for _pd_node in self._cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
            msg = 'Some pd node is unhealthy, maybe server stopped or network unreachable, unhealthy node list: {}'.format(
                ','.join(_unhealth_node))
            term.fatal(msg)
            raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')

    _current_pd_num = len(self._pd_status)
    _current_tikv_num = len(self._tikv_stores)

    if 'pd_servers' in self._diff and len(
            self._diff['pd_servers']) == _current_pd_num:
        term.fatal('Can not delete all pd nodes.')
        exit(1)

    if 'tikv_servers' in self._diff and len(
            self._diff['tikv_servers']) == _current_tikv_num:
        term.fatal('Can not delete all tikv nodes.')
        exit(1)

    term.info('Check ssh connection.')
    self.act.check_ssh_connection()

    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service, self._diff)
        if not component and not pattern:
            continue
        uuid = [x['uuid'] for x in self._diff[pattern]]
        term.normal('Delete {}, node list: {}'.format(
            component, ','.join(uuid)))
        for _uuid in uuid:
            self.__delete_component(self._diff, component, pattern, _uuid)
            if component not in ['tikv', 'pump', 'drainer']:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=_uuid)
            if component != 'blackbox_exporter':
                self.topology.replace(self.topology.remove(_uuid)[0])
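

# The two guards above refuse a scale-in that would remove every PD or every
# TiKV node. A standalone sketch of the same rule (the helper name is
# hypothetical; the current counts come from the PD API in the real code):
def _sketch_would_remove_all(diff, pattern, current_count):
    # True when the diff deletes as many nodes of this pattern as currently exist.
    return pattern in diff and len(diff[pattern]) == current_count

# Example: _sketch_would_remove_all({'pd_servers': [1, 2, 3]}, 'pd_servers', 3)
# is True, so the scale-in must be aborted.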
def _process(self, component=None, pattern=None, node=None, role=None):
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()

    term.info('Stopping TiDB cluster.')
    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service, config=self.topology())
        if not component and not pattern:
            continue
        try:
            self.act.stop_component(component=component,
                                    pattern=pattern,
                                    node=node)
        except exceptions.TiOPSWarning as e:
            term.debug(str(e))

    for service in self.topology.service_group[::-1]:
        component, pattern = self.check_exist(service, config=self.topology())
        if not component and not pattern:
            continue
        term.normal('{} is being destroyed.'.format(component))
        try:
            self.act.destroy_component(component=component,
                                       pattern=pattern,
                                       node=node)
        except exceptions.TiOPSWarning as e:
            term.debug(str(e))

    # remove deploy dir
    self.ans.run_model('shell',
                       'rm -rf {{ full_deploy_dir | cluster_dir }}',
                       become=True,
                       group='*')
    self.ans.run_model('shell',
                       'rm -rf {{ full_data_dir | cluster_dir }}',
                       become=True,
                       group='*')
def init(self, demo=False):
    term.notice('Start init management machine.')
    key_home = utils.profile_path('.ssh')
    if not os.path.exists(key_home):
        utils.create_dir(key_home)
        os.chmod(os.path.join(key_home), 0o700)
    if not os.path.isfile(os.path.join(key_home, 'id_rsa')) or \
            not os.path.isfile(os.path.join(key_home, 'id_rsa.pub')):
        term.info('There is no SSH key. Start generating.')
        os.system(
            '/usr/bin/ssh-keygen -t rsa -N \'\' -f {}/id_rsa -q'.format(key_home))
    else:
        term.normal('SSH key already exists, skip creating.')
    if demo:
        term.notice('Finished init management machine.')
    else:
        term.notice('Done!!!')
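

# The key generation above shells out to ssh-keygen via os.system(). A hedged
# equivalent using subprocess.check_call() is sketched below; the helper name
# is hypothetical, and the flags mirror the command string used in init().
def _sketch_ensure_rsa_key(key_home):
    import os
    import subprocess
    key_path = os.path.join(key_home, 'id_rsa')
    # Only generate a key pair when either half is missing.
    if os.path.isfile(key_path) and os.path.isfile(key_path + '.pub'):
        return
    # -N '' sets an empty passphrase, -q keeps ssh-keygen quiet.
    subprocess.check_call(
        ['/usr/bin/ssh-keygen', '-t', 'rsa', '-N', '', '-f', key_path, '-q'])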
def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Running command on specified node in cluster.')
    elif role:
        term.notice('Running command on specified role in cluster.')
    else:
        term.notice('Running command on all nodes in cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)

    try:
        _sudo = self._args.root
    except AttributeError:
        _sudo = False

    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service, config=_topology)
        if not component and not pattern:
            continue
        if not node:
            term.info('Running command on {}.'.format(component))
            self.__run(pattern=pattern, sudo=_sudo, cmd=self.cmd)
        else:
            _uuid = [x['uuid'] for x in _topology[pattern]]
            term.info('Running command on {}, node list: {}.'.format(
                component, ','.join(_uuid)))
            self.__run(pattern=pattern,
                       node=','.join(_uuid),
                       sudo=_sudo,
                       cmd=self.cmd)
def _process(self, component=None, pattern=None, node=None, role=None):
    term.info('Check ssh connection.')
    self.act.check_ssh_connection()
    self.act.edit_file()
    try:
        term.info('Create directory in all added nodes.')
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            self.act.create_directory(component=component,
                                      pattern=pattern,
                                      node=','.join(uuid))

        # check machine cpu / memory / disk
        self.act.check_machine_config(self._diff)

        # start run scale-out
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            term.normal('Add {}, node list: {}.'.format(
                component, ','.join(uuid)))
            _template_dir = self.topology.cache_template_dir
            self.act.deploy_component(component=component,
                                      pattern=pattern,
                                      node=','.join(uuid),
                                      template_dir=_template_dir)
            self.act.deploy_firewall(component=component,
                                     pattern=pattern,
                                     node=','.join(uuid))
            self.act.start_component(component=component,
                                     pattern=pattern,
                                     node=','.join(uuid))
    finally:
        os.popen('rm -rf {}'.format(self.topology.cache_template_dir))
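

# Each Action call above limits the play to the newly added nodes by passing a
# comma-separated list of their uuids. A small sketch of that extraction (the
# helper name is hypothetical):
def _sketch_node_list(diff, pattern):
    # e.g. {'tikv_servers': [{'uuid': 'a'}, {'uuid': 'b'}]} -> 'a,b'
    return ','.join(x['uuid'] for x in diff.get(pattern, []))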
def init_network(self, demo=False):
    term.notice(
        'Start creating no-password ssh connections between the management machine and other machines.')
    self._check_ip_list()

    # get password from prompt
    if not self._args.password:
        term.info(
            'Please enter the password of init user on deployment server ({} password).'
            .format(self.init_user))
        _passwd = term.getpass()
    else:
        _passwd = self._args.password

    # create ansible runner
    initnet = ansibleapi.ANSRunner(ips=self.hosts,
                                   user=self.init_user,
                                   tiargs=self._args,
                                   passwd=_passwd)

    term.info('Create {} user on remote machines.'.format(self.user))
    initnet.run_model('user',
                      'name=%s '
                      'shell=/bin/bash '
                      'createhome=yes' % (self.user),
                      become=True)

    term.info('Set authorized_keys for {} on cluster machines.'.format(self.user))
    initnet.run_model('authorized_key',
                      'user=%s '
                      'key={{ lookup("file", "%s/id_rsa.pub") }}' % (
                          self.user, utils.profile_path('.ssh')),
                      become=True)

    term.info('Add sudo permissions for {} on cluster machines.'.format(self.user))
    initnet.run_model('lineinfile',
                      'path=/etc/sudoers '
                      'line="{} ALL=(ALL) NOPASSWD: ALL" '
                      'regexp="^{} .*" '
                      'insertafter=EOF '
                      'state=present'.format(self.user, self.user),
                      become=True)

    if demo:
        term.notice('Finished setting up SSH keys.')
    else:
        term.notice('Done!!!')
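

# The lineinfile task above adds one NOPASSWD sudoers rule per deploy user and
# keeps it idempotent via the regexp match. A sketch of the module argument
# string it builds (the function is hypothetical; the format string is the one
# used in init_network(), and 'tidb' is only an assumed example user):
def _sketch_sudoers_args(user):
    return ('path=/etc/sudoers '
            'line="{} ALL=(ALL) NOPASSWD: ALL" '
            'regexp="^{} .*" '
            'insertafter=EOF '
            'state=present'.format(user, user))

# For user 'tidb' this renders the sudoers rule line
# "tidb ALL=(ALL) NOPASSWD: ALL".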
def _process(self, component=None, pattern=None, node=None, role=None):
    if node:
        term.notice('Reload specified node in cluster.')
    elif role:
        term.notice('Reload specified role in cluster.')
    else:
        term.notice('Reload TiDB cluster.')
    _topology = self.topology.role_node(roles=role, nodes=node)

    _cluster = modules.ClusterAPI(topology=self.topology)
    _unhealth_node = []
    for _pd_node in _cluster.status():
        if not _pd_node['health']:
            _unhealth_node.append(_pd_node['name'])
            msg = 'Some pd node is unhealthy, maybe server stopped or network unreachable, unhealthy node list: {}'.format(
                ','.join(_unhealth_node))
            term.fatal(msg)
            raise exceptions.TiOPSRuntimeError(msg, operation='reload')

    term.info('Check ssh connection.')
    self.act.check_ssh_connection()

    # each service entry should only contain one item
    for service in self.topology.service_group:
        component, pattern = self.check_exist(service=service, config=_topology)
        if not component and not pattern:
            continue

        # reload pd servers, reload the leader node last
        if component == 'pd':
            _pd_list = []
            for _node in _topology[pattern]:
                if _node['uuid'] == _cluster.pd_leader():
                    _leader = _node
                else:
                    _pd_list.append(_node)
            _pd_list.append(_leader)

            for _node in _pd_list:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(component, _uuid))
                if _uuid == _cluster.pd_leader():
                    _cluster.evict_pd_leader(uuid=_uuid)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)
            continue

        if pattern in ['monitored_servers', 'monitoring_server',
                       'grafana_server', 'alertmanager_server']:
            if not node:
                term.normal('Reload {}.'.format(component))
                self.act.deploy_component(component=component, pattern=pattern)
                self.act.stop_component(component=component, pattern=pattern)
                self.act.start_component(component=component, pattern=pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Reload {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=','.join(_uuid))
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=','.join(_uuid))
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=','.join(_uuid))
            continue

        for _node in _topology[pattern]:
            _uuid = _node['uuid']
            _host = _node['ip']
            term.normal('Reload {}, node id: {}.'.format(component, _uuid))
            if pattern == 'tikv_servers':
                _port = _node['port']
                _cluster.evict_store_leaders(host=_host, port=_port)
            self.act.deploy_component(component=component,
                                      pattern=pattern,
                                      node=_uuid)
            self.act.stop_component(component=component,
                                    pattern=pattern,
                                    node=_uuid)
            self.act.start_component(component=component,
                                     pattern=pattern,
                                     node=_uuid)
            if pattern == 'tikv_servers':
                _cluster.remove_evict(host=_host, port=_port)
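

# The PD branch above reloads all followers first and the current leader last
# (after evicting its leadership), so leadership only has to move once during
# the rolling reload. A standalone sketch of that ordering (the helper name is
# hypothetical):
def _sketch_order_pd_nodes(pd_nodes, leader_uuid):
    # Put every follower first and the current leader at the end of the list.
    followers = [n for n in pd_nodes if n['uuid'] != leader_uuid]
    leader = [n for n in pd_nodes if n['uuid'] == leader_uuid]
    return followers + leader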
def init_host(self, demo=False):
    term.notice('Begin initializing the cluster machine.')
    self._check_ip_list()
    inithost = ansibleapi.ANSRunner(ips=self.hosts,
                                    user=self.user,
                                    tiargs=self._args)
    inithost.run_model('file',
                       'path=%s '
                       'state=directory '
                       'owner=%s '
                       'group=%s' % (self.check_dir, self.user, self.user),
                       become=True)

    if not os.path.exists(self.host_vars):
        os.mkdir(self.host_vars)
    elif not os.path.isdir(self.host_vars):
        os.remove(self.host_vars)
        os.mkdir(self.host_vars)

    # get host environment vars
    term.info('Get all host environment vars.')
    setup = inithost.run_model('setup',
                               'gather_subset="all" '
                               'gather_timeout=120')

    # record host environment vars
    for _host, _vars in setup['success'].iteritems():
        __vars = json.loads(json.dumps(_vars))
        utils.write_yaml(
            os.path.join(self.host_vars, '{}.yaml'.format(_host)), __vars)

    if self.enable_checks:
        term.info('Check OS platform.')
        self.check_os_platform(setup)

        term.info('Check OS version.')
        self.check_os_version(setup)

        # check if cpu supports EPOLLEXCLUSIVE
        term.info('Check if CPU supports EPOLLEXCLUSIVE.')
        inithost.run_model('copy',
                           'src=%s/{{ item }} '
                           'dest=%s '
                           'mode=0755' % (self.script_path, self.check_dir),
                           with_items=['epollexclusive', 'run_epoll.sh'])
        check_epoll = inithost.run_model(
            'shell', '%s/run_epoll.sh' % (self.check_dir))
        _unsupport_epoll = [
            host for host, info in check_epoll['success'].iteritems()
            if 'True' not in info['stdout']
        ]
        if _unsupport_epoll:
            raise exceptions.TiOPSRuntimeError(
                'CPU does not support epollexclusive on {} machine, please change system version or upgrade kernel version.'
                .format(','.join(_unsupport_epoll).replace('_', '.')),
                operation='init')

        # check systemd service version (systemd package, not operating system)
        _systemd = inithost.run_model('yum', 'list=systemd', become=True)
        term.info('Check systemd service version.')
        self.check_systemd_version(_systemd)

    if not demo:
        # check and set cpu mode to performance
        term.info('Set CPU performance mode if supported.')
        # enable cpupower service
        inithost.run_model('systemd',
                           'name="cpupower" '
                           'state=started '
                           'enabled=yes')

        # set timezone
        if self.timezone:
            inithost.run_model(
                'blockinfile',
                'path=/home/%s/.bash_profile '
                'insertbefore="# End of file" '
                'block="export TZ=%s"' % (self.user, self.timezone))

        # install ntp package
        if self.ntp_server:
            term.info('Install ntpd package.')
            inithost.run_model('yum',
                               'name="ntp" '
                               'state=present',
                               become=True)
            term.info('Add ntp servers to ntpd config and reload.')
            _ntp_config = [
                'server {} iburst'.format(server)
                for server in self.ntp_server.split(',')
            ]
            inithost.run_model('blockinfile',
                               'path="/etc/ntp.conf" '
                               'insertbefore="# End of file" '
                               'marker="#{mark} TiDB %s MANAGED BLOCK" '
                               'block="%s"' % (self.user, '\n'.join(_ntp_config)),
                               become=True)
            inithost.run_model('systemd',
                               'name=ntpd '
                               'state=restarted '
                               'enabled=yes',
                               become=True)

    # check if time is synchronized
    if self.enable_check_ntp:
        term.info('Check if NTP is running to synchronize the time.')
        result = inithost.run_model(
            'shell', 'ntpstat | grep -w synchronised | wc -l')
        _unrun = [
            host for host, info in result['success'].iteritems()
            if info['stderr']
        ]
        if _unrun:
            raise exceptions.TiOPSRuntimeError(
                'Ntp server may be stopped on {} machine.'.format(
                    ','.join(_unrun).replace('_', '.')),
                operation='init')
        _unsync = [
            host for host, info in result['success'].iteritems()
            if info['stdout'] == str(0)
        ]
        if _unsync:
            raise exceptions.TiOPSRuntimeError(
                'Time unsynchronised on {}, please check ntp status.'.format(
                    ','.join(_unsync).replace('_', '.')),
                operation='init')

    term.info('Update kernel parameters on cluster machine.')
    inithost.run_model('sysctl',
                       'name={{ item.name }} value={{ item.value }}',
                       become=True,
                       with_items=[
                           {'name': 'net.ipv4.tcp_tw_recycle', 'value': 0},
                           {'name': 'net.core.somaxconn', 'value': 32768},
                           {'name': 'vm.swappiness', 'value': 0},
                           {'name': 'net.ipv4.tcp_syncookies', 'value': 0},
                           {'name': 'fs.file-max', 'value': 1000000},
                       ])
    inithost.run_model(
        'blockinfile',
        'path="/etc/security/limits.conf" '
        'insertbefore="# End of file" '
        'marker="#{mark} TiDB %s MANAGED BLOCK" '
        'block="%s soft nofile 1000000\n'
        '%s hard nofile 1000000\n'
        '%s soft stack 10240\n"' % (self.user, self.user, self.user, self.user),
        become=True)

    if not self.enable_swap:
        term.info('Turn off swap on remote machine.')
        inithost.run_model('shell', 'swapoff -a', become=True)
    else:
        term.info('Turn on swap on remote machine.')
        inithost.run_model('shell', 'swapon -a', become=True)

    # disable selinux
    term.info('Disable selinux.')
    inithost.run_model('selinux', 'state=disabled', become=True)

    # set and start irqbalance
    if not demo and not self.disable_irqbalance:
        term.info('Set and start irqbalance.')
        inithost.run_model('lineinfile',
                           'dest=/etc/sysconfig/irqbalance '
                           'regexp="(?<!_)ONESHOT=" '
                           'line="ONESHOT=yes"',
                           become=True)
        inithost.run_model('systemd',
                           'name=irqbalance.service '
                           'state=restarted '
                           'enabled=yes',
                           become=True)

    if demo:
        term.notice('Finished init deployment machine.')
    else:
        term.notice('Done!!!')
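

# The blockinfile tasks above write managed blocks into /etc/ntp.conf and
# /etc/security/limits.conf. The sketches below show the rendered block
# contents; the helper names are hypothetical, and the ntp_server and user
# values in the usage note are placeholder examples, but the format strings
# mirror the ones used in init_host().
def _sketch_ntp_block(ntp_server):
    # One "server <host> iburst" line per comma-separated NTP server.
    return '\n'.join('server {} iburst'.format(s) for s in ntp_server.split(','))


def _sketch_limits_block(user):
    # The three ulimit lines written between the TiDB managed-block markers.
    return ('{0} soft nofile 1000000\n'
            '{0} hard nofile 1000000\n'
            '{0} soft stack 10240\n'.format(user))

# Example: _sketch_ntp_block('ntp1.example.com,ntp2.example.com') yields two
# "server ... iburst" lines, and _sketch_limits_block('tidb') yields the
# nofile/stack limits applied to the deploy user.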