Code example #1
 def __init__(self, args=None, topology=None):
     super(OprExec, self).__init__(args, topology)
     self.act = Action(ans=self.ans, topo=self.topology)
     self._result = {
         'failed': {},
         'success': {},
         'unreachable': {},
     }
Code example #2
File: scale.py Project: nrc/tiup
 def __init__(self, args=None, topology=None, new_srvs=None):
     if os.path.exists(topology.topology_file):
         term.warn(
             'Check TiDB cluster {} status, it may take a few minutes.'.
             format(topology.cluster_name))
         self.check_tombstone(topology, args)
     self._new_topo, self._diff = topology.add(new_srvs)
     topology.replace(self._new_topo, write=False)
     super(OprScaleOut, self).__init__(args, topology, action='deploy')
     self.act = Action(ans=self.ans, topo=self.topology)
Code example #3
File: scale.py Project: nrc/tiup
    def __init__(self, args=None, topology=None, node=None):
        if not node:
            msg = 'Node ID not specified.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)

        self._new_topo, self._diff = topology.remove(node)

        super(OprScaleIn, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)
Code example #4
File: upgrade.py Project: nrc/tiup
 def __init__(self, args=None, topology=None):
     super(OprUpgrade, self).__init__(args, topology)
     self.act = Action(ans=self.ans, topo=self.topology)
     try:
         self.arg_ver = args.tidb_version
     except AttributeError:
         raise exceptions.TiOPSConfigError(
             '--tidb-version is not set when upgrade, abort.')
     try:
         self.force = args.force
     except AttributeError:
         self.force = False
Code example #5
File: scale.py Project: nrc/tiup
    def _post(self, component=None, pattern=None, node=None, role=None):
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology(),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')

        # self.deploy.deploy_component(component='prometheus', pattern='monitoring_server', ans=ans)
        # self.reload.do(component='prometheus', pattern='monitoring_server')

        term.notice('Finished scaling in.')
Code example #6
class OprRestart(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprRestart, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Restart specified node in cluster.')
        elif role:
            term.notice('Restart specified role in cluster.')
        else:
            term.notice('Restart TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.normal('Stopping {}.'.format(component))
                self.act.stop_component(component, pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Stopping {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.stop_component(component, pattern, ','.join(_uuid))

        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.normal('Starting {}.'.format(component))
                self.act.start_component(component, pattern)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.normal('Starting {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.act.start_component(component, pattern, ','.join(_uuid))

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished restart.')
Code example #7
File: scale.py Project: nrc/tiup
    def _post(self, component=None, pattern=None, node=None, role=None):
        # if 'pd_servers' in self._diff:
        #    reload_pd = True
        # else:
        #    reload_pd = False
        self.topology.replace(self._new_topo)
        term.info('Update configuration.')
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology._topology(
                                       self._new_topo),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')

        act.deploy_component(component='prometheus',
                             pattern='monitoring_server')
        act.stop_component(component='prometheus', pattern='monitoring_server')
        act.start_component(component='prometheus',
                            pattern='monitoring_server')
        term.notice('Finished scaling out.')
Code example #8
File: scale.py Project: nrc/tiup
class OprScaleOut(OperationBase):
    def __init__(self, args=None, topology=None, new_srvs=None):
        if os.path.exists(topology.topology_file):
            term.warn(
                'Check TiDB cluster {} status, it may take a few minutes.'.
                format(topology.cluster_name))
            self.check_tombstone(topology, args)
        self._new_topo, self._diff = topology.add(new_srvs)
        topology.replace(self._new_topo, write=False)
        super(OprScaleOut, self).__init__(args, topology, action='deploy')
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        if not self._diff:
            msg = 'No new nodes to scale out.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)
        term.notice('Begin adding node(s) to TiDB cluster.')

        # copy template
        utils.create_dir(self.topology.cache_template_dir)
        utils.copy_template(source=os.path.join(self.topology.titemplate_dir),
                            target=os.path.join(
                                self.topology.cache_template_dir))

        # update scripts when scaling out.
        for service in ['pd', 'tikv', 'tidb', 'pump', 'drainer']:
            if '{}_servers'.format(service) in self._diff:
                template_path = os.path.join(
                    self.topology.cache_template_dir,
                    'scripts/run_{}.sh.j2'.format(service))
                _original, new_template = utils.script_template(
                    path=self.topology.cluster_dir,
                    template=template_path,
                    service=service)
                utils.write_template(template_path, new_template)

    def _process(self, component=None, pattern=None, node=None, role=None):
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        self.act.edit_file()
        try:
            term.info('Create directory in all add nodes.')
            for service in self.topology.service_group:
                component, pattern = self.check_exist(service, self._diff)
                if not component and not pattern:
                    continue
                uuid = [x['uuid'] for x in self._diff[pattern]]
                self.act.create_directory(component=component,
                                          pattern=pattern,
                                          node=','.join(uuid))

            # check machine cpu / memory / disk
            self.act.check_machine_config(self._diff)
            # start run scale-out
            for service in self.topology.service_group:
                component, pattern = self.check_exist(service, self._diff)
                if not component and not pattern:
                    continue
                uuid = [x['uuid'] for x in self._diff[pattern]]
                term.normal('Add {}, node list: {}.'.format(
                    component, ','.join(uuid)))
                _template_dir = self.topology.cache_template_dir
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=','.join(uuid),
                                          template_dir=_template_dir)
                self.act.deploy_firewall(component=component,
                                         pattern=pattern,
                                         node=','.join(uuid))
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=','.join(uuid))
        finally:
            os.popen('rm -rf {}'.format(self.topology.cache_template_dir))

    def _post(self, component=None, pattern=None, node=None, role=None):
        # if 'pd_servers' in self._diff:
        #    reload_pd = True
        # else:
        #    reload_pd = False
        self.topology.replace(self._new_topo)
        term.info('Update configuration.')
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology._topology(
                                       self._new_topo),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')

        act.deploy_component(component='prometheus',
                             pattern='monitoring_server')
        act.stop_component(component='prometheus', pattern='monitoring_server')
        act.start_component(component='prometheus',
                            pattern='monitoring_server')
        term.notice('Finished scaling out.')
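A minimal usage sketch, assumed from the scale-out branch of the CLI dispatcher in code example #12: the servers to add are read from a YAML topology file and handed to OprScaleOut together with the existing cluster topology; do() presumably drives the _prepare/_process/_post phases shown above.

# Hypothetical invocation mirroring code example #12.
topo = topology.Topology(args)             # existing cluster topology
addTopo = utils.read_yaml(args.topology)   # new servers described in a YAML file
op.OprScaleOut(args, topo, addTopo).do()   # runs _prepare/_process/_post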
Code example #9
File: scale.py Project: nrc/tiup
class OprScaleIn(OperationBase):
    def __init__(self, args=None, topology=None, node=None):
        if not node:
            msg = 'Node ID not specified.'
            term.error(msg)
            raise exceptions.TiOPSConfigError(msg)

        self._new_topo, self._diff = topology.remove(node)

        super(OprScaleIn, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.notice('Begin deleting node(s) from TiDB cluster.')
        self._cluster = modules.ClusterAPI(topology=self.topology)
        self._pd_status = self._cluster.status()
        self._tikv_stores = self._cluster.tikv_stores()

    def _process(self, component=None, pattern=None, node=None, role=None):
        _unhealth_node = []
        for _pd_node in self._cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = 'Some pd node is unhealthy, maybe server stopped or network unreachable, unhealthy node list: {}'.format(
                    ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='scaleIn')

        _current_pd_num = len(self._pd_status)
        _current_tikv_num = len(self._tikv_stores)

        if 'pd_servers' in self._diff and len(
                self._diff['pd_servers']) == _current_pd_num:
            term.fatal('Can not delete all pd node.')
            exit(1)

        if 'tikv_servers' in self._diff and len(
                self._diff['tikv_servers']) == _current_tikv_num:
            term.fatal('Can not delete all tikv node.')
            exit(1)

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service, self._diff)
            if not component and not pattern:
                continue
            uuid = [x['uuid'] for x in self._diff[pattern]]
            term.normal('Delete {}, node list: {}'.format(
                component, ','.join(uuid)))
            for _uuid in uuid:
                self.__delete_component(self._diff, component, pattern, _uuid)
                if component not in ['tikv', 'pump', 'drainer']:
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.destroy_component(component=component,
                                               pattern=pattern,
                                               node=_uuid)
                if component != 'blackbox_exporter':
                    self.topology.replace(self.topology.remove(_uuid)[0])

    def _post(self, component=None, pattern=None, node=None, role=None):
        ans = ansibleapi.ANSRunner(user=self.topology.user,
                                   topology=self.topology(),
                                   tiargs=self._args)
        act = Action(ans=ans, topo=self.topology)
        if 'pd_servers' in self._diff:
            act.deploy_component(component='pd', pattern='pd_servers')
            act.deploy_component(component='tikv', pattern='tikv_servers')
            act.deploy_component(component='tidb', pattern='tidb_servers')
            act.deploy_component(component='pump', pattern='pump_servers')
            act.deploy_component(component='drainer',
                                 pattern='drainer_servers')

        # self.deploy.deploy_component(component='prometheus', pattern='monitoring_server', ans=ans)
        # self.reload.do(component='prometheus', pattern='monitoring_server')

        term.notice('Finished scaling in.')

    def __delete_component(self,
                           config=None,
                           component=None,
                           pattern=None,
                           uuid=None):
        if component == 'pd':
            try:
                self._cluster.del_pd(uuid)
            except exceptions.TiOPSException as e:
                term.fatal(
                    'Unable to delete PD node from cluster: {}'.format(e))
                exit(1)

        if component == 'tikv':
            _tikv_info = ''
            for _tikv_node in config[pattern]:
                if _tikv_node['uuid'] != uuid:
                    continue
                if _tikv_node['offline']:
                    return
                _tikv_info = _tikv_node
            for ctikv in self._tikv_stores['stores']:
                # check if node in cluster
                if '{}:{}'.format(
                        _tikv_info['ip'],
                        _tikv_info['port']) == ctikv['store']['address']:
                    _store_id = ctikv['store']['id']

                    # delete store through api
                    try:
                        self._cluster.del_store(_store_id)
                    except exceptions.TiOPSException as e:
                        term.fatal('Unable to delete store: {}'.format(e))
                        exit(1)

        if component == 'drainer':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_drainer(node_id=uuid)

        if component == 'pump':
            _binlog = modules.BinlogAPI(topology=self.topology)
            _binlog.delete_pump(node_id=uuid)
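A minimal usage sketch, assumed from the scale-in branch in code example #12: the node ID to remove is passed both to the constructor, which computes the topology diff, and to do().

# Hypothetical invocation mirroring code example #12.
topo = topology.Topology(args)
op.OprScaleIn(args, topo, args.node_id).do(node=args.node_id)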
Code example #10
File: upgrade.py Project: nrc/tiup
class OprUpgrade(OprDeploy):
    def __init__(self, args=None, topology=None):
        super(OprUpgrade, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)
        try:
            self.arg_ver = args.tidb_version
        except AttributeError:
            raise exceptions.TiOPSConfigError(
                '--tidb-version is not set when upgrade, abort.')
        try:
            self.force = args.force
        except AttributeError:
            self.force = False

    # check versions; this updates version-related variables in memory but does not write them to disk
    def __check_version(self):
        new_ver = self.arg_ver.lstrip('v')
        curr_ver = self.topology.version.lstrip('v')
        _cmp = semver.compare(curr_ver, new_ver)
        if _cmp == 0:
            raise exceptions.TiOPSArgumentError(
                'Already running version {}.'.format(curr_ver))
        elif _cmp > 0:
            raise exceptions.TiOPSRuntimeError(
                'Downgrade is not supported, keep running {}.'.format(curr_ver), operation='upgrade')

        # update version and related variables
        self.old_ver = curr_ver
        self.new_ver = new_ver
        self.topology.version = 'v{}'.format(new_ver)
        self.topology.tiversion_dir = os.path.join(
            self.topology.tidown_dir, '{}'.format(self.topology.version))
        self.topology.resource_dir = utils.profile_path(
            'downloads', '{}/resources'.format(self.topology.version))
        self.topology.dashboard_dir = utils.profile_path(
            'downloads', '{}/dashboards'.format(self.topology.version))
        self.topology.package_dir = utils.profile_path(
            'downloads', '{}/packages'.format(self.topology.version))
        self.topology.config_dir = utils.profile_path(
            'downloads', '{}/configs'.format(self.topology.version))

    # Check if the configuration of the tidb component is reasonable
    def _check_config(self, topology=None):
        if not topology:
            topology = self.topology()
        _servers = [
            {'pd': 'pd_servers'},
            {'tikv': 'tikv_servers'},
            {'tidb': 'tidb_servers'},
        ]

        for _service in _servers:
            _component, _pattern = self.check_exist(
                _service, config=topology)
            if not _component and not _pattern:
                continue
            term.info('Check {} configuration.'.format(_component))
            self.act.configCheck(component=_component, pattern=_pattern, node=topology[_pattern][0]['uuid'])

    # TODO: check and merge configs
    def __check_config(self):
        pass

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        # check versions before processing
        self.__check_version()

        term.notice('Upgrading from v{} to v{}.'.format(
            self.old_ver, self.new_ver))

        # download packages for new version
        term.info('Downloading TiDB related binary, it may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(version=self.new_ver, local_pkg=_local)

        # check configs
        self.__check_config()

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Upgrade specified node in cluster.')
        elif role:
            term.notice('Upgrade specified role in cluster.')
        else:
            term.notice('Upgrade TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)
        if self._args.enable_check_config:
            self._check_config()
        # for service in ['pd', 'tikv', 'pump', 'tidb']:
        # grp = [x for x in self.topology.service_group if service in x.keys()]
        _cluster = modules.ClusterAPI(topology=self.topology)
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = 'Some pd node is unhealthy, maybe server stopped or network unreachable, unhealthy node list: {}'.format(
                    ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='upgrade')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        if self.force:
            for service in self.topology.service_group:
                component, pattern = self.check_exist(
                    service=service, config=_topology)
                if not component and not pattern:
                    continue
                if pattern in ['monitored_servers', 'monitoring_server', 'grafana_server', 'alertmanager_server']:
                    term.normal('Upgrade {}.'.format(component))
                    self.act.deploy_component(
                        component=component, pattern=pattern)
                    self.act.stop_component(
                        component=component, pattern=pattern)
                    self.act.start_component(
                        component=component, pattern=pattern)
                    continue

                for _node in _topology[pattern]:
                    _uuid = _node['uuid']
                    term.normal('Upgrade {}, node id: {}.'.format(
                        component, _uuid))
                    self.act.deploy_component(
                        component=component, pattern=pattern, node=_uuid)
                    self.act.stop_component(
                        component=component, pattern=pattern, node=_uuid)
                    self.act.start_component(
                        component=component, pattern=pattern, node=_uuid)
            return

        # every time should only contain one item
        for service in self.topology.service_group:
            component, pattern = self.check_exist(
                service=service, config=_topology)
            if not component and not pattern:
                continue
            # upgrade pd servers, upgrading the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Upgrade {}, node id: {}.'.format(
                        component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)

                    self.act.deploy_component(
                        component=component, pattern=pattern, node=_uuid)
                    self.act.stop_component(
                        component=component, pattern=pattern, node=_uuid)
                    self.act.start_component(
                        component=component, pattern=pattern, node=_uuid)
                continue

            if pattern in ['monitored_servers', 'monitoring_server', 'grafana_server', 'alertmanager_server']:
                term.normal('Upgrade {}.'.format(component))
                self.act.deploy_component(component=component, pattern=pattern)
                self.act.stop_component(component=component, pattern=pattern)
                self.act.start_component(component=component, pattern=pattern)
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Upgrade {}, node id: {}.'.format(component, _uuid))
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)

                self.act.deploy_component(
                    component=component, pattern=pattern, node=_uuid)
                self.act.stop_component(
                    component=component, pattern=pattern, node=_uuid)
                self.act.start_component(
                    component=component, pattern=pattern, node=_uuid)

                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)

    def _post(self, component=None, pattern=None, node=None, role=None):
        self.topology.set_meta(version=self.new_ver)
        term.notice('Upgraded to {}.'.format(self.topology.version))
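A minimal usage sketch, assumed from the upgrade branch in code example #12: args must carry tidb_version (otherwise the constructor raises TiOPSConfigError), while force is optional and defaults to False.

# Hypothetical invocation mirroring code example #12; args.tidb_version is
# required, args.force is optional.
topo = topology.Topology(args)
op.OprUpgrade(args, topo).do(node=args.node_id, role=args.role)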
Code example #11
class OprExec(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprExec, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)
        self._result = {
            'failed': {},
            'success': {},
            'unreachable': {},
        }

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        try:
            self.cmd = ' '.join(self._args.cmd)
        except AttributeError:
            raise exceptions.TiOPSArgumentError(
                'No command specified, do nothing.')
        term.notice('Run raw shell command on {} cluster.'.format(
            self.topology.cluster_name))
        term.normal('{}'.format(self.cmd))

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Running command on specified node in cluster.')
        elif role:
            term.notice('Running command on specified role in cluster.')
        else:
            term.notice('Running command on all nodes in cluster.')

        _topology = self.topology.role_node(roles=role, nodes=node)

        try:
            _sudo = self._args.root
        except AttributeError:
            _sudo = False

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()

        for service in self.topology.service_group:
            component, pattern = self.check_exist(service, config=_topology)
            if not component and not pattern:
                continue
            if not node:
                term.info('Running command on {}.'.format(component))
                self.__run(pattern=pattern, sudo=_sudo, cmd=self.cmd)
            else:
                _uuid = [x['uuid'] for x in _topology[pattern]]
                term.info('Running command on {}, node list: {}.'.format(
                    component, ','.join(_uuid)))
                self.__run(pattern=pattern,
                           node=','.join(_uuid),
                           sudo=_sudo,
                           cmd=self.cmd)

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished running command on {} cluster.'.format(
            self.topology.version))

        print(term.bold_cyan('Success:'))
        for host, out in self._result['success'].items():
            _output = 'stdout: {}'.format(out['stdout'])
            if len(out['stderr']) > 0:
                _output += '\nstderr: {}'.format(out['stderr'])
            print(term.plain_green('{}:'.format(host)))
            print(term.plain(_output))

        if len(self._result['unreachable']) > 0:
            print(term.bold_yellow('Unreachable:'))
            for host, out in self._result['unreachable'].items():
                _output = 'stdout: {}'.format(out['stdout'])
                if len(out['stderr']) > 0:
                    _output += '\nstderr: {}'.format(out['stderr'])
                print(term.plain_yellow('{}:'.format(host)))
                print(term.plain(_output))

        if len(self._result['failed']) > 0:
            print(term.bold_red('Failed:'))
            for host, out in self._result['failed'].items():
                _output = 'stdout: {}'.format(out['stdout'])
                if len(out['stderr']) > 0:
                    _output += '\nstderr: {}'.format(out['stderr'])
                print(term.plain_red('{}:'.format(host)))
                print(term.plain(_output))

    def __run(self, pattern=None, node=None, sudo=False, cmd=None):
        try:
            _result = self.act.run_shell(pattern=pattern,
                                         node=node,
                                         sudo=sudo,
                                         cmd=cmd)
        except exceptions.TiOPSRuntimeError as e:
            term.error('Error execute command: {}'.format(e))
            _result = e.ctx

        for host, out in _result['success'].items():
            if not host in self._result['success'].keys():
                self._result['success'][host] = out
        for host, out in _result['failed'].items():
            if not host in self._result['failed'].keys():
                self._result['failed'][host] = out
        for host, out in _result['unreachable'].items():
            if not host in self._result['unreachable'].keys():
                self._result['unreachable'][host] = out
Code example #12
def main(args=None):
    try:
        action = args.action
    except AttributeError:
        action = None  # avoid a NameError below when args carries no action

    if action == 'version':
        print(term.plain(TiOPSVer()))
        exit(0)

    if action == 'quickdeploy':
        term.warn(
            'The quick deploy mode is for demo and testing, do NOT use in production!'
        )

        # do init
        _init = init.Init(args)
        try:
            _init.init(demo=True)
            _init.init_network(demo=True)
            _init.init_host(demo=True)
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)

        # do deploy
        topo = topology.Topology(args=args, merge=True)
        try:
            op.OprDeploy(args, topo, demo=True).do()
            op.OprStart(args, topo, demo=True).do()
            tm.TUIModule(topo, args=args).display()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSRequestError as e:
            msg = "{}, URL {} returned {}, please check the network and try again.".format(
                e.msg, e.url, e.code)
            term.error(msg)
            sys.exit(1)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)

    elif action == 'bootstrap-local':
        _init = init.Init(args)
        try:
            _init.init()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-ssh':
        _init = init.Init(args)
        try:
            _init.init_network()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    elif action == 'bootstrap-host':
        _init = init.Init(args)
        try:
            _init.init_host()
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)
    else:
        try:
            if action not in ['deploy', 'display']:
                topo = topology.Topology(args)
        except TiOPSRuntimeError as e:
            tierror(e)
        except TiOPSException as e:
            term.debug(traceback.format_exc())
            term.fatal(str(e))
            sys.exit(1)

        if action == 'display':
            try:
                _cluster_name = args.cluster_name
            except AttributeError:
                _cluster_name = None
            try:
                if _cluster_name and len(_cluster_name) > 0:
                    topo = topology.Topology(args)
                    _list = False
                else:
                    topo = None
                    _list = True
                tm.TUIModule(topo, args=args).display(_list)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'deploy':
            topo = topology.Topology(args=args, merge=True)
            try:
                op.OprDeploy(args, topo).do()
                tm.TUIModule(topo, args=args).display()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSRequestError as e:
                msg = "{}, URL {} returned {}, please check the network and try again.".format(
                    e.msg, e.url, e.code)
                term.error(msg)
                sys.exit(1)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'start':
            try:
                op.OprStart(args, topo).do(node=args.node_id, role=args.role)
                tm.TUIModule(topo, args=args, status=True).display()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'stop':
            try:
                op.OprStop(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'restart':
            try:
                op.OprRestart(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'reload':
            try:
                op.OprReload(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'upgrade':
            try:
                op.OprUpgrade(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSRequestError as e:
                msg = "{}, URL {} returned {}, please check the network and try again.".format(
                    e.msg, e.url, e.code)
                term.error(msg)
                sys.exit(1)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'destroy':
            try:
                op.OprDestroy(args, topo).do()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'edit-config':
            try:
                Action(topo=topo).edit_file()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'scale-out':
            addTopo = utils.read_yaml(args.topology)
            try:
                op.OprScaleOut(args, topo, addTopo).do()
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'scale-in':
            try:
                op.OprScaleIn(args, topo, args.node_id).do(node=args.node_id)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
        elif action == 'exec':
            try:
                op.OprExec(args, topo).do(node=args.node_id, role=args.role)
            except TiOPSRuntimeError as e:
                tierror(e)
            except TiOPSException as e:
                term.debug(traceback.format_exc())
                term.fatal(str(e))
                sys.exit(1)
Code example #13
class OprDeploy(OperationBase):
    def __init__(self, args=None, topology=None, demo=False):
        super(OprDeploy, self).__init__(args, topology, demo=demo)
        self.act = Action(ans=self.ans, topo=self.topology)
        self.demo = demo

    def _check_config(self):
        _servers = [
            {
                'pd': 'pd_servers'
            },
            {
                'tikv': 'tikv_servers'
            },
            {
                'tidb': 'tidb_servers'
            },
        ]

        for _service in _servers:
            _component, _pattern = self.check_exist(_service,
                                                    config=self.topology())
            if not _component and not _pattern:
                continue
            term.normal('Check {} configuration.'.format(_component))
            self.act.configCheck(component=_component,
                                 pattern=_pattern,
                                 node=self.topology()[_pattern][0]['uuid'])

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        if self.topology.version and self._args.tidb_version:
            new_ver = self._args.tidb_version.lstrip('v')
            curr_ver = self.topology.version.lstrip('v')
            _cmp = semver.compare(curr_ver, new_ver)
            if _cmp > 0:
                raise exceptions.TiOPSArgumentError(
                    'Running version is {}, can\'t downgrade.'.format(
                        curr_ver))

        term.notice('Begin installing TiDB cluster.')
        # download packages
        term.info(
            'Downloading TiDB related binary, it may take a few minutes.')
        try:
            _local = self._args.local_pkg
        except AttributeError:
            _local = None
        self.act.download(local_pkg=_local)

        if not self.demo:
            # edit config
            self.act.edit_file()

            term.info('Check ssh connection.')
            self.act.check_ssh_connection()

            if self._args.enable_check_config:
                self._check_config()

    def _process(self, component=None, pattern=None, node=None, role=None):
        # create directory
        term.info('Create directory in all nodes.')
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            self.act.create_directory(component=component, pattern=pattern)

        if not self.demo:
            self.act.check_machine_config()

        # start run deploy
        if self.demo:
            term.warn(
                'FirewallD is being disabled on deployment machines in quick deploy mode.'
            )
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            term.normal('Deploy {}.'.format(component))
            self.act.deploy_component(component=component, pattern=pattern)
            self.act.deploy_firewall(component=component, pattern=pattern)

        if not self.demo:
            self.act.deploy_tool()

    def _post(self, component=None, pattern=None, node=None, role=None):
        self.topology.set_meta()
        self.topology._save_topology()
        if self.demo:
            term.notice('Finished deploying TiDB cluster {} ({}).'.format(
                self.topology.cluster_name, self.topology.version))
        else:
            term.notice(
                'Finished deploying TiDB cluster {} ({}), don\'t forget to start it.'
                .format(self.topology.cluster_name, self.topology.version))
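A minimal usage sketch, assumed from the deploy and quickdeploy branches in code example #12: the topology is built with merge=True, and demo=True switches on the quick-deploy behavior.

# Hypothetical invocation mirroring code example #12.
topo = topology.Topology(args=args, merge=True)
op.OprDeploy(args, topo).do()                 # regular deployment
# op.OprDeploy(args, topo, demo=True).do()    # quick-deploy / demo mode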
Code example #14
File: base.py Project: nrc/tiup
    def check_tombstone(self, topology=None, args=None):
        if not topology:
            topology = self.topology
        if not args:
            args = self._args
        _remove_uuid = []
        _cluster = ClusterAPI(topology)
        _binlog = BinlogAPI(topology)

        if _cluster.tikv_stores() and _cluster.tikv_tombstone():
            # get tombstone tikv node
            for _node in topology()['tikv_servers']:
                _tombstone = False
                if not _node['offline']:
                    continue

                # online tikv node list
                _online_list = [
                    x['store']['address']
                    for x in _cluster.tikv_stores()['stores']
                ]
                # tombstone status tikv list
                _tombstone_list = [
                    x['store']['address']
                    for x in _cluster.tikv_tombstone()['stores']
                ]

                _address = '{}:{}'.format(_node['ip'], _node['port'])

                # if node is online, skip it
                if _address in _online_list:
                    continue
                # if the node is in tombstone state, remove it from the topology
                elif _address in _tombstone_list:
                    _remove_uuid.append(_node['uuid'])

        if _binlog.pump_status:
            # get tombstone pump node
            for _node in topology()['pump_servers']:
                _tombstone = False
                if not _node['offline']:
                    continue

                _online_list = [
                    x['nodeId']
                    for x in _binlog.pump_status['status'].itervalues()
                    if x['state'] != 'offline'
                ]
                _tombstone_list = [
                    x['nodeId']
                    for x in _binlog.pump_status['status'].itervalues()
                    if x['state'] == 'offline'
                ]

                if _node['uuid'] in _online_list:
                    continue
                elif _node['uuid'] in _tombstone_list:
                    _remove_uuid.append(_node['uuid'])

            for _node in topology()['drainer_servers']:
                _tombstone = False
                if not _node['offline']:
                    continue

                _online_list = [
                    x['nodeId'] for x in _binlog.drainer_status
                    if x['state'] != 'offline'
                ]
                _tombstone_list = [
                    x['nodeId'] for x in _binlog.drainer_status
                    if x['state'] == 'offline'
                ]

                if _node['uuid'] in _online_list:
                    continue
                elif _node['uuid'] in _tombstone_list:
                    _remove_uuid.append(_node['uuid'])

        if not _remove_uuid:
            return

        _new_topo, _diff = topology.remove(','.join(_remove_uuid), delete=True)
        ans = ansibleapi.ANSRunner(user=topology.user,
                                   topology=_diff,
                                   tiargs=args)
        act = Action(ans=ans, topo=topology)
        for service in [{
                'drainer': 'drainer_servers'
        }, {
                'pump': 'pump_servers'
        }, {
                'tikv': 'tikv_servers'
        }]:
            component, pattern = self.check_exist(service, _diff)
            if not component and not pattern:
                continue
            act.stop_component(component=component, pattern=pattern)
            act.destroy_component(component=component, pattern=pattern)

        topology.replace(_new_topo)
Code example #15
class OprReload(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprReload, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _process(self, component=None, pattern=None, node=None, role=None):
        if node:
            term.notice('Reload specified node in cluster.')
        elif role:
            term.notice('Reload specified role in cluster.')
        else:
            term.notice('Reload TiDB cluster.')
        _topology = self.topology.role_node(roles=role, nodes=node)

        _cluster = modules.ClusterAPI(topology=self.topology)
        _unhealth_node = []
        for _pd_node in _cluster.status():
            if not _pd_node['health']:
                _unhealth_node.append(_pd_node['name'])
                msg = 'Some pd node is unhealthy, maybe server stopped or network unreachable, unhealthy node list: {}'.format(
                    ','.join(_unhealth_node))
                term.fatal(msg)
                raise exceptions.TiOPSRuntimeError(msg, operation='reload')

        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        # every time should only contain one item
        for service in self.topology.service_group:
            component, pattern = self.check_exist(service=service,
                                                  config=_topology)
            if not component and not pattern:
                continue
            # reload pd servers, reloading the leader node last
            if component == 'pd':
                _pd_list = []
                for _node in _topology[pattern]:
                    if _node['uuid'] == _cluster.pd_leader():
                        _leader = _node
                    else:
                        _pd_list.append(_node)
                _pd_list.append(_leader)

                for _node in _pd_list:
                    _uuid = _node['uuid']
                    _host = _node['ip']
                    term.normal('Reload {}, node id: {}.'.format(
                        component, _uuid))
                    if _uuid == _cluster.pd_leader():
                        _cluster.evict_pd_leader(uuid=_uuid)

                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=_uuid)
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=_uuid)
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=_uuid)
                continue

            if pattern in [
                    'monitored_servers', 'monitoring_server', 'grafana_server',
                    'alertmanager_server'
            ]:
                if not node:
                    term.normal('Reload {}.'.format(component))
                    self.act.deploy_component(component=component,
                                              pattern=pattern)
                    self.act.stop_component(component=component,
                                            pattern=pattern)
                    self.act.start_component(component=component,
                                             pattern=pattern)
                else:
                    _uuid = [x['uuid'] for x in _topology[pattern]]
                    term.normal('Reload {}, node list: {}.'.format(
                        component, ','.join(_uuid)))
                    self.act.deploy_component(component=component,
                                              pattern=pattern,
                                              node=','.join(_uuid))
                    self.act.stop_component(component=component,
                                            pattern=pattern,
                                            node=','.join(_uuid))
                    self.act.start_component(component=component,
                                             pattern=pattern,
                                             node=','.join(_uuid))
                continue

            for _node in _topology[pattern]:
                _uuid = _node['uuid']
                _host = _node['ip']
                term.normal('Reload {}, node id: {}.'.format(component, _uuid))
                if pattern == 'tikv_servers':
                    _port = _node['port']
                    _cluster.evict_store_leaders(host=_host, port=_port)
                self.act.deploy_component(component=component,
                                          pattern=pattern,
                                          node=_uuid)
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=_uuid)
                self.act.start_component(component=component,
                                         pattern=pattern,
                                         node=_uuid)

                if pattern == 'tikv_servers':
                    _cluster.remove_evict(host=_host, port=_port)

    def _post(self, component=None, pattern=None, node=None, role=None):
        term.notice('Finished reload config for {} cluster.'.format(
            self.topology.version))
Code example #16
File: stop.py Project: nrc/tiup
 def __init__(self, args=None, topology=None):
     super(OprStop, self).__init__(args, topology)
     self.act = Action(ans=self.ans, topo=self.topology)
Code example #17
File: start.py Project: nrc/tiup
 def __init__(self, args=None, topology=None, demo=False):
     super(OprStart, self).__init__(args, topology, demo=demo)
     self.act = Action(ans=self.ans, topo=self.topology)
     self.demo = demo
Code example #18
class OprDestroy(OperationBase):
    def __init__(self, args=None, topology=None):
        super(OprDestroy, self).__init__(args, topology)
        self.act = Action(ans=self.ans, topo=self.topology)

    def _prepare(self, component=None, pattern=None, node=None, role=None):
        term.warn('The TiDB cluster {} ({}) is going to be destroyed.'.format(
            self.topology.cluster_name, self.topology.version))
        rm_prompt = 'This operation will ' + term.warn_red('remove') \
                    + ' the TiDB cluster ' + term.highlight_red(self.topology.cluster_name) \
                    + '. It can NOT be undone. ' + term.yes_no() + ':'
        notice = term.input(rm_prompt)
        if notice.lower() not in ['y', 'yes']:
            term.notice('Terminate the destroy operation.')
            raise exceptions.TiOPSRuntimeError('Operation cancelled by user.')

    def _process(self, component=None, pattern=None, node=None, role=None):
        term.info('Check ssh connection.')
        self.act.check_ssh_connection()
        term.info('Stopping TiDB cluster.')
        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            try:
                self.act.stop_component(component=component,
                                        pattern=pattern,
                                        node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))
                pass

        for service in self.topology.service_group[::-1]:
            component, pattern = self.check_exist(service,
                                                  config=self.topology())
            if not component and not pattern:
                continue
            term.normal('{} is being destroyed.'.format(component))
            try:
                self.act.destroy_component(component=component,
                                           pattern=pattern,
                                           node=node)
            except exceptions.TiOPSWarning as e:
                term.debug(str(e))
                pass

        # remove deploy dir
        self.ans.run_model('shell',
                           'rm -rf {{ full_deploy_dir | cluster_dir }}',
                           become=True,
                           group='*')

        self.ans.run_model('shell',
                           'rm -rf {{ full_data_dir | cluster_dir }}',
                           become=True,
                           group='*')

    def _post(self, component=None, pattern=None, node=None, role=None):
        try:
            utils.remove_dir(utils.profile_path(self.topology.cluster_dir))
        except Exception as e:
            logging.warning(e)

        term.notice('TiDB cluster destroyed.')