Example #1
    def _post(self, component=None, pattern=None, node=None, role=None):
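        # best-effort cleanup: remove the cluster's cached profile directory,
        # logging any failure instead of aborting the destroy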
        try:
            utils.remove_dir(utils.profile_path(self.topology.cluster_dir))
        except Exception as e:
            logging.warning(e)

        term.notice('TiDB cluster destroyed.')
Example #2
    def _post(self, component=None, pattern=None, node=None, role=None):
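        # print a per-host summary of the run, grouped by result state
        # (success / unreachable / failed)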
        term.notice('Finished reload config for {} cluster.'.format(
            self.topology.version))

        print(term.bold_cyan('Success:'))
        for host, out in self._result['success'].items():
            _output = 'stdout: {}'.format(out['stdout'])
            if len(out['stderr']) > 0:
                _output += '\nstderr: {}'.format(out['stderr'])
            print(term.plain_green('{}:'.format(host)))
            print(term.plain(_output))

        if len(self._result['unreachable']) > 0:
            print(term.bold_yellow('Unreachable:'))
            for host, out in self._result['unreachable'].items():
                _output = 'stdout: {}'.format(out['stdout'])
                if len(out['stderr']) > 0:
                    _output += '\nstderr: {}'.format(out['stderr'])
                print(term.plain_yellow('{}:'.format(host)))
                print(term.plain(_output))

        if len(self._result['failed']) > 0:
            print(term.bold_red('Failed:'))
            for host, out in self._result['failed'].items():
                _output = 'stdout: {}'.format(out['stdout'])
                if len(out['stderr']) > 0:
                    _output += '\nstderr: {}'.format(out['stderr'])
                print(term.plain_red('{}:'.format(host)))
                print(term.plain(_output))
Example #3
File: scale.py Project: nrc/tiup
    def _post(self, component=None, pattern=None, node=None, role=None):
        # if 'pd_servers' in self._diff:
        #    reload_pd = True
        # else:
        #    reload_pd = False
        self.topology.replace(self._new_topo)
        term.info('Update configuration.')
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology._topology(
                                       self._new_topo),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
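        # a changed PD member set affects every component's config, so
        # redeploy them all before refreshing monitoring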
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')

        act.deploy_component(component='prometheus',
                             pattern='monitoring_server')
        act.stop_component(component='prometheus', pattern='monitoring_server')
        act.start_component(component='prometheus',
                            pattern='monitoring_server')
        term.notice('Finished scaling out.')
Example #4
    def _prepare(self, component=None, pattern=None, node=None, role=None):
        if self.topology.version and self._args.tidb_version:
            new_ver = self._args.tidb_version.lstrip('v')
            curr_ver = self.topology.version.lstrip('v')
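            # semver.compare returns a positive value when curr_ver > new_ver,
            # i.e. the requested version would be a downgrade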
            _cmp = semver.compare(curr_ver, new_ver)
            if _cmp > 0:
                raise exceptions.TiOPSArgumentError(
                    'Running version is {}, can\'t downgrade.'.format(
                        curr_ver))

        term.notice('Begin installing TiDB cluster.')
        # download packages
        term.info(
            'Downloading TiDB related binaries, this may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(local_pkg=_local)

        if not self.demo:
            # edit config
            self.act.edit_file()

            term.info('Check ssh connection.')
            self.act.check_ssh_connection()

            if self._args.enable_check_config:
                self._check_config()
Example #5
    def _prepare(self, component=None, pattern=None, node=None, role=None):
        # join the raw argument list into a single shell command string
        try:
            self.cmd = ' '.join(self._args.cmd)
        except AttributeError:
            raise exceptions.TiOPSArgumentError(
                'No command specified, nothing to do.')
        term.notice('Run raw shell command on {} cluster.'.format(
            self.topology.cluster_name))
        term.normal('{}'.format(self.cmd))
Example #6
    def _post(self, component=None, pattern=None, node=None, role=None):
        # persist cluster metadata and topology before announcing the result
        self.topology.set_meta()
        self.topology._save_topology()
        if self.demo:
            term.notice('Finished deploying TiDB cluster {} ({}).'.format(
                self.topology.cluster_name, self.topology.version))
        else:
            term.notice(
                'Finished deploying TiDB cluster {} ({}), don\'t forget to start it.'
                .format(self.topology.cluster_name, self.topology.version))
Example #7
    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.warn('The TiDB cluster {} ({}) is going to be destroyed.'.format(
            self.topology.cluster_name, self.topology.version))
        rm_prompt = 'This operation will ' + term.warn_red('remove') \
                    + ' the TiDB cluster ' + term.highlight_red(self.topology.cluster_name) \
                    + '. It can NOT be undone. ' + term.yes_no() + ':'
        notice = term.input(rm_prompt)
        if notice.lower() not in ['y', 'yes']:
            term.notice('Terminating the destroy operation.')
            raise exceptions.TiOPSRuntimeError('Operation cancelled by user.')
Example #8
File: scale.py Project: nrc/tiup
    def _post(self, component=None, pattern=None, node=None, role=None):
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology(),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
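        # as in scale-out, a changed PD member set means every dependent
        # component needs its config redeployed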
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')

        # self.deploy.deploy_component(component='prometheus', pattern='monitoring_server', ans=ans)
        # self.reload.do(component='prometheus', pattern='monitoring_server')

        term.notice('Finished scaling in.')
Example #9
    def _prepare(self, component=None, pattern=None, node=None, role=None):
        # check versions before processing
        self.__check_version()

        term.notice('Upgrading from v{} to v{}.'.format(
            self.old_ver, self.new_ver))

        # download packages for new version
        term.info('Downloading TiDB related binaries, this may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(version=self.new_ver, local_pkg=_local)

        # check configs
        self.__check_config()
Example #10
File: start.py Project: nrc/tiup
    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Start specified node in cluster.')
        elif role:
            term.notice('Start specified role in cluster.')
        else:
            term.notice('Start TiDB cluster.')
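        # narrow the topology to the requested roles/nodes, if any were given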
        _topology = self.topology.role_node(roles=role, nodes=node)

        if not self.demo:
            term.info('Check ssh connection.')
            self.act.check_ssh_connection()

        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.normal('Starting {}.'.format(component))
                self.act.start_component(component, pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Starting {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.start_component(component, pattern, ','.join(_uuid))
Example #11
    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Running command on specified node in cluster.')
        elif role:
            term.notice('Running command on specified role in cluster.')
        else:
            term.notice('Running command on all nodes in cluster.')

        _topology = self.topology.role_node(roles=role, nodes=node)

        try:
            _sudo = self._args.root
        except AttributeError:
            _sudo = False

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.info('Running command on {}.'.format(component))
                self.__run(pattern=pattern, sudo=_sudo, cmd=self.cmd)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.info('Running command on {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.__run(pattern=pattern,
                           node=','.join(_uuid),
                           sudo=_sudo,
                           cmd=self.cmd)
Example #12
File: scale.py Project: nrc/tiup
    def _prepare(self, component=None, pattern=None, node=None, role=None):
        if not self._diff:
            msg = 'No new nodes to scale out.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)
        term.notice('Begin adding node(s) to the TiDB cluster.')

        # copy template
        utils.create_dir(self.topology.cache_template_dir)
        utils.copy_template(source=os.path.join(self.topology.titemplate_dir),
                            target=os.path.join(
                                self.topology.cache_template_dir))

        # update run scripts when scaling out
        for service in ['pd', 'tikv', 'tidb', 'pump', 'drainer']:
            if '{}_servers'.format(service) in self._diff:
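                # update the cached run_{service}.sh template so it matches
                # the new topology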
                template_path = os.path.join(
                    self.topology.cache_template_dir,
                    'scripts/run_{}.sh.j2'.format(service))
                _original, new_template = utils.script_template(
                    path=self.topology.cluster_dir,
                    template=template_path,
                    service=service)
                utils.write_template(template_path, new_template)
Example #13
File: init.py Project: nrc/tiup
    def init_network(self, demo=False):
        term.notice(
            'Setting up passwordless SSH connections between the management machine and the other machines'
        )
        self._check_ip_list()

        # get password from prompt
        if not self._args.password:
            term.info(
                'Please enter the password of the init user on the deployment server ({} password)'
                .format(self.init_user))
            _passwd = term.getpass()
        else:
            _passwd = self._args.password

        # create ansible runner
        initnet = ansibleapi.ANSRunner(ips=self.hosts,
                                       user=self.init_user,
                                       tiargs=self._args,
                                       passwd=_passwd)
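        # bootstrap over password auth; the steps below create the deploy
        # user and install the SSH key so later runs are passwordless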
        term.info('Create {} user on remote machines.'.format(self.user))
        initnet.run_model('user',
                          'name=%s '
                          'shell=/bin/bash '
                          'createhome=yes' % (self.user),
                          become=True)
        term.info('Set authorized_keys for {} on cluster machine.'.format(
            self.user))
        initnet.run_model('authorized_key',
                          'user=%s '
                          'key={{ lookup("file", "%s/id_rsa.pub") }}' %
                          (self.user, utils.profile_path('.ssh')),
                          become=True)
        term.info('Add sudo permissions for {} on cluster machine.'.format(
            self.user))
        initnet.run_model('lineinfile',
                          'path=/etc/sudoers '
                          'line="{} ALL=(ALL) NOPASSWD: ALL" '
                          'regexp="^{} .*" '
                          'insertafter=EOF '
                          'state=present'.format(self.user, self.user),
                          become=True)
        if demo:
            term.notice('Finished setting up SSH keys.')
        else:
            term.notice('Done!!!')
Example #14
File: init.py Project: nrc/tiup
    def init(self, demo=False):
        term.notice('Begin initializing the management machine.')
        key_home = utils.profile_path('.ssh')
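        # ensure the profile's .ssh directory exists with 0700 permissions
        # before checking for / generating the key pair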
        if not os.path.exists(key_home):
            utils.create_dir(key_home)
            os.chmod(os.path.join(key_home), 0o700)
        if not os.path.isfile(os.path.join(key_home, 'id_rsa')) or \
                not os.path.isfile(os.path.join(key_home, 'id_rsa.pub')):
            term.info('No SSH key found for user {}, generating one.'.format(
                getpass.getuser()))
            os.system(
                '/usr/bin/ssh-keygen -t rsa -N \'\' -f {}/id_rsa -q'.format(
                    key_home))
        else:
            term.normal('SSH key already exists for user {}, skipping creation.'.format(
                getpass.getuser()))

        if demo:
            term.notice('Finished initializing the management machine.')
        else:
            term.notice('Done!!!')
Example #15
File: start.py Project: nrc/tiup
    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished starting.')
Example #16
    def _post(self, component=None, pattern=None, node=None, role=None):
        # record the new version in the cluster metadata
        self.topology.set_meta(version=self.new_ver)
        term.notice('Upgraded to {}.'.format(self.topology.version))
Example #17
File: scale.py Project: nrc/tiup
    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.notice('Begin deleting node(s) from the TiDB cluster.')
        # snapshot current PD status and TiKV stores before removing nodes
        self._cluster = modules.ClusterAPI(topology=self.topology)
        self._pd_status = self._cluster.status()
        self._tikv_stores = self._cluster.tikv_stores()
Example #18
if __name__ == '__main__':
    _parser = cmd.TiOPSParser()
    args = _parser()

    # add logging facilities, but outputs are not modified to use it yet
    if args.verbose:
        logging.basicConfig(
            filename=utils.profile_path("tiops.log"),
            format=
            '[%(asctime)s] [%(levelname)s] %(message)s (at %(filename)s:%(lineno)d in %(funcName)s).',
            datefmt='%Y-%m-%d %T %z',
            level=logging.DEBUG)
        logging.info("Using logging level: DEBUG.")
        logging.debug("Debug logging enabled.")
        logging.debug("Input arguments are: %s" % args)
    else:
        logging.basicConfig(
            filename=utils.profile_path("tiops.log"),
            format='[%(asctime)s] [%(levelname)s] %(message)s.',
            datefmt='%Y-%m-%d %T %z',
            level=logging.INFO)
        logging.info("Using logging level: INFO.")

    try:
        main(args)
    except KeyboardInterrupt:
        # not auto cleaning up because the operation may be undefined
        term.notice(
            'Process interrupted, make sure to check if configs are correct.')
Example #19
File: init.py Project: nrc/tiup
    def init_host(self, demo=False):
        term.notice('Begin initializing the cluster machine.')
        self._check_ip_list()
        inithost = ansibleapi.ANSRunner(ips=self.hosts,
                                        user=self.user,
                                        tiargs=self._args)

        inithost.run_model('file',
                           'path=%s '
                           'state=directory '
                           'owner=%s '
                           'group=%s' % (self.check_dir, self.user, self.user),
                           become=True)

        if not os.path.exists(self.host_vars):
            os.mkdir(self.host_vars)
        elif not os.path.isdir(self.host_vars):
            os.remove(self.host_vars)
            os.mkdir(self.host_vars)

        # get host environment vars
        term.info('Get all host environment vars.')
        setup = inithost.run_model('setup', 'gather_subset="all" '
                                   'gather_timeout=120')

        # record host environment vars
        for _host, _vars in setup['success'].items():
            # JSON round-trip to normalize Ansible facts into plain dicts
            __vars = json.loads(json.dumps(_vars))
            utils.write_yaml(
                os.path.join(self.host_vars, '{}.yaml'.format(_host)), __vars)

        if self.enable_checks:
            term.info('Check OS platform.')
            self.check_os_platform(setup)

            term.info('Check OS version.')
            self.check_os_version(setup)

            # check cpu if support EPOLLEXCLUSIVE
            term.info('Check if CPU supports EPOLLEXCLUSIVE.')
            inithost.run_model('copy',
                               'src=%s/{{ item }} '
                               'dest=%s '
                               'mode=0755' %
                               (self.script_path, self.check_dir),
                               with_items=['epollexclusive', 'run_epoll.sh'])

            check_epoll = inithost.run_model(
                'shell', '%s/run_epoll.sh' % (self.check_dir))

            # hosts that print 'True' in stdout support EPOLLEXCLUSIVE
            _unsupported_epoll = [
                host for host, info in check_epoll['success'].items()
                if 'True' not in info['stdout']
            ]

            if _unsupported_epoll:
                raise exceptions.TiOPSRuntimeError(
                    'CPU does not support EPOLLEXCLUSIVE on {}, please change the system version or upgrade the kernel version.'
                    .format(','.join(_unsupported_epoll).replace('_', '.')),
                    operation='init')

            # check systemd service version (systemd package, not operating system)
            _systemd = inithost.run_model('yum', 'list=systemd', become=True)
            term.info('Check systemd service version.')
            self.check_systemd_version(_systemd)

        if not demo:
            # check and set cpu mode to performance
            term.info('Set CPU performance mode if supported.')

            # enable cpupower service
            inithost.run_model('systemd', 'name="cpupower" '
                               'state=started '
                               'enabled=yes')

            # set timezone
            if self.timezone:
                inithost.run_model(
                    'blockinfile', 'path=/home/%s/.bash_profile '
                    'insertbefore="# End of file" '
                    'block="export TZ=%s"' % (self.user, self.timezone))

            # install ntp package
            if self.ntp_server:
                term.info('Install ntpd package.')
                inithost.run_model('yum', 'name="ntp" '
                                   'state=present',
                                   become=True)

                term.info('Add ntp servers to ntpd config and reload.')

                _ntp_config = [
                    'server {} iburst'.format(server)
                    for server in self.ntp_server.split(',')
                ]

                inithost.run_model('blockinfile',
                                   'path="/etc/ntp.conf" '
                                   'insertbefore="# End of file" '
                                   'marker="#{mark} TiDB %s MANAGED BLOCK" '
                                   'block="%s"' %
                                   (self.user, '\n'.join(_ntp_config)),
                                   become=True)

                inithost.run_model('systemd', 'name=ntpd '
                                   'state=restarted '
                                   'enabled=yes',
                                   become=True)

            # check if time synchronize
            if self.enable_check_ntp:
                term.info('Check if NTP is running to synchronize the time.')

                result = inithost.run_model(
                    'shell', 'ntpstat | grep -w synchronised | wc -l')

                _unrun = [
                    host for host, info in result['success'].items()
                    if info['stderr']
                ]
                if _unrun:
                    raise exceptions.TiOPSRuntimeError(
                        'NTP server may be stopped on {}.'.format(
                            ','.join(_unrun).replace('_', '.')),
                        operation='init')

                _unsync = [
                    host for host, info in result['success'].items()
                    if info['stdout'] == '0'
                ]
                if _unsync:
                    raise exceptions.TiOPSRuntimeError(
                        'Time is unsynchronised on {}, please check NTP status.'
                        .format(','.join(_unsync).replace('_', '.')),
                        operation='init')

            term.info('Update kernel parameters on cluster machine.')
            inithost.run_model('sysctl',
                               'name={{ item.name }} value={{ item.value }}',
                               become=True,
                               with_items=[{
                                   'name': 'net.ipv4.tcp_tw_recycle',
                                   'value': 0
                               }, {
                                   'name': 'net.core.somaxconn',
                                   "value": 32768
                               }, {
                                   'name': 'vm.swappiness',
                                   "value": 0
                               }, {
                                   'name': 'net.ipv4.tcp_syncookies',
                                   "value": 0
                               }, {
                                   'name': 'fs.file-max',
                                   "value": 1000000
                               }])
            inithost.run_model(
                'blockinfile',
                'path="/etc/security/limits.conf" '
                'insertbefore="# End of file" '
                'marker="#{mark} TiDB %s MANAGED BLOCK" '
                'block="%s        soft        nofile        1000000\n'
                '%s        hard        nofile        1000000\n'
                '%s        soft        stack         10240\n"' %
                (self.user, self.user, self.user, self.user),
                become=True)

            if not self.enable_swap:
                term.info('Turn off swap on remote machine.')
                inithost.run_model('shell', 'swapoff -a', become=True)
            else:
                term.info('Turn on swap on remote machine.')
                inithost.run_model('shell', 'swapon -a', become=True)

        # disable selinux
        term.info('Disable selinux.')
        inithost.run_model('selinux', 'state=disabled', become=True)

        # set and start irqbalance
        if not demo and not self.disable_irqbalance:
            term.info('Set and start irqbalance.')
            inithost.run_model('lineinfile', 'dest=/etc/sysconfig/irqbalance '
                               'regexp="(?<!_)ONESHOT=" '
                               'line="ONESHOT=yes"',
                               become=True)
            inithost.run_model('systemd', 'name=irqbalance.service '
                               'state=restarted '
                               'enabled=yes',
                               become=True)

        if demo:
            term.notice('Finished initializing the deployment machine.')
        else:
            term.notice('Done!!!')
Example #20
    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Reload specified node in cluster.')
        elif role:
            term.notice('Reload specified role in cluster.')
        else:
            term.notice('Reload TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        _cluster = modules.ClusterAPI(topology=self.topology)
        # collect all unhealthy PD nodes first so the error lists every one
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
        if _unhealth_node:
            msg = 'Some PD nodes are unhealthy, maybe the server is stopped or the network is unreachable, unhealthy node list: {}'.format(
                ','.join(_unhealth_node))
            term.fatal(msg)
            raise exceptions.TiOPSRuntimeError(msg, operation='reload')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        # each iteration handles exactly one service group
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service=service,
                                                  config=_topology)
            if not component and not pattern:
                continue
            # reload pd servers, handling the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Reload {}, node id: {}.'.format(
                        component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)

                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=_uuid)
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=_uuid)
                continue

            if pattern in [
                    'monitored_servers', 'monitoring_server', 'grafana_server',
                    'alertmanager_server'
            ]:
                if not node:
                    term.normal('Reload {}.'.format(component))
                    self.act.deploy_component(component=component,
                                              pattern=pattern)
                    self.act.stop_component(component=component,
                                            pattern=pattern)
                    self.act.start_component(component=component,
                                             pattern=pattern)
                else:
                    _uuid = [x['uuid'] for x in _topology[pattern]]
                    term.normal('Reload {}, node list: {}.'.format(
                        component, ','.join(_uuid)))
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=','.join(_uuid))
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=','.join(_uuid))
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=','.join(_uuid))
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(component, _uuid))
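                # for TiKV, evict region leaders from the store before the
                # rolling restart, then lift the eviction afterwards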
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)

                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)
Example #21
    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished reload config for {} cluster.'.format(
            self.topology.version))