def cstop(self, master=True, slave=True):
    """Stop current masters or slaves
    """
    center = Center()
    center.update_ip_port()
    success = center.check_hosts_connection()
    if not success:
        return
    center.stop_current_nodes(master, slave)

def rowcount(self):
    """Query and show cluster row count
    """
    logger.debug('rowcount')
    masters = []
    center = Center()
    center.update_ip_port()
    master_nodes = center.get_master_obj_list()
    for master_node in master_nodes:
        node = master_node['addr']
        masters.append(node)
    # open-redis-cli-all info Tablespace | grep totalRows | awk -F ',
    # ' '{print $4}' | awk -F '=' '{sum += $2} END {print sum}'
    ret = RedisCliUtil.command_all_async('info Tablespace', slave=True)
    outs = ''
    for _, host, port, res, stdout in ret:
        if res == 'OK':
            endpoint = '{}:{}'.format(host, port)
            if endpoint in masters:
                outs = '\n'.join([outs, stdout])
        else:
            logger.warning("FAIL {}:{} {}".format(host, port, stdout))
    lines = outs.splitlines()
    key = 'totalRows'
    partitions = 'partitions'
    evictions = 'evictedRows'
    # materialize the filter so it can be iterated more than once below
    filtered_lines = list(filter(lambda x: key in x, lines))
    # collect table ids
    table_list = []
    result = []
    for line in filtered_lines:
        table_stats, _ = line.split(':')
        table_id = table_stats.split('_')
        if table_id[1] not in table_list:
            table_list.append(table_id[1])
    for tid in table_list:
        # match the table id field exactly to avoid substring collisions
        table_lines = filter(
            lambda x: x.split(':')[0].split('_')[1] == tid, filtered_lines)
        ld = RedisCliUtil.to_list_of_dict(table_lines)
        row_count = reduce(lambda x, y: x + int(y[key]), ld, 0)
        partitions_count = reduce(lambda x, y: x + int(y[partitions]), ld, 0)
        evictions_count = reduce(lambda x, y: x + int(y[evictions]), ld, 0)
        result.append([tid, row_count, partitions_count, evictions_count])
    utils.print_table(
        [['Table_ID', 'ROW_COUNT', 'PARTITION_COUNT', 'EVICTED_ROWS']]
        + result)

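# Illustrative sketch (not part of the CLI): how the reduce-based
# aggregation in rowcount() collapses per-partition stats into per-table
# totals. The sample records are made up; RedisCliUtil.to_list_of_dict is
# assumed to yield one dict per 'tableStats_<id>' line.
from functools import reduce

def sum_field(records, field):
    """Sum an integer field across a list of stat dicts."""
    return reduce(lambda acc, rec: acc + int(rec[field]), records, 0)

_sample = [
    {'totalRows': '100', 'partitions': '2', 'evictedRows': '0'},
    {'totalRows': '250', 'partitions': '3', 'evictedRows': '5'},
]
assert sum_field(_sample, 'totalRows') == 350
assert sum_field(_sample, 'evictedRows') == 5
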
def start(self, profile=False, master=True, slave=True):
    """Start cluster

    :param master: Set False to exclude master nodes
    :param slave: Set False to exclude slave nodes
    """
    logger.debug("command 'cluster start'")
    if not isinstance(profile, bool):
        msg = message.get('error_option_type_not_boolean')
        msg = msg.format(option='profile')
        logger.error(msg)
        return
    if not isinstance(master, bool):
        msg = message.get('error_option_type_not_boolean')
        msg = msg.format(option='master')
        logger.error(msg)
        return
    if not isinstance(slave, bool):
        msg = message.get('error_option_type_not_boolean')
        msg = msg.format(option='slave')
        logger.error(msg)
        return
    center = Center()
    center.update_ip_port()
    success = center.check_hosts_connection()
    if not success:
        return
    center.ensure_cluster_exist()
    if master:
        master_alive_count = center.get_alive_master_redis_count()
        master_alive_count_mine = center.get_alive_master_redis_count(
            check_owner=True)
        not_mine_count = master_alive_count - master_alive_count_mine
        if not_mine_count > 0:
            msg = message.get('error_cluster_start_master_collision')
            msg = '\n'.join(msg).format(count=not_mine_count)
            raise LightningDBError(11, msg)
    if slave:
        slave_alive_count = center.get_alive_slave_redis_count()
        slave_alive_count_mine = center.get_alive_slave_redis_count(
            check_owner=True)
        not_mine_count = slave_alive_count - slave_alive_count_mine
        if not_mine_count > 0:
            msg = message.get('error_cluster_start_slave_collision')
            msg = '\n'.join(msg).format(count=not_mine_count)
            raise LightningDBError(12, msg)
    center.backup_server_logs(master=master, slave=slave)
    center.create_redis_data_directory()
    # equal to cluster.configure()
    center.configure_redis()
    center.sync_conf(show_result=True)
    center.start_redis_process(profile, master=master, slave=slave)
    center.wait_until_all_redis_process_up(master=master, slave=slave)

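# Minimal sketch of the collision check used in start(): if some alive
# redis processes on the configured ports are not owned by the current
# user, starting would collide with another deployment. The counts here
# are placeholders; the real values come from
# Center.get_alive_*_redis_count(check_owner=...).
def foreign_process_count(alive_count, alive_count_mine):
    """Return how many alive processes belong to another user."""
    return alive_count - alive_count_mine

assert foreign_process_count(10, 10) == 0  # safe to start
assert foreign_process_count(10, 7) == 3   # 3 foreign processes -> abort
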
def configure(self):
    """Configure cluster

    Make conf file of redis with redis properties information.
    """
    center = Center()
    center.update_ip_port()
    success = center.check_hosts_connection()
    if not success:
        return
    center.configure_redis()
    center.sync_conf(show_result=True)

def thriftserver(self):
    """Edit 'thriftserver.properties'
    """
    cluster_id = config.get_cur_cluster_id()
    path_of_fb = config.get_path_of_fb(cluster_id)
    target_path = path_of_fb['thrift_properties']
    self._edit_conf(target_path, syntax='sh')
    center = Center()
    center.update_ip_port()
    success = center.check_hosts_connection()
    if not success:
        return
    success = center.sync_file(target_path)
    if success:
        msg = message.get('complete_conf_edit')
        logger.info(msg)

def slave(self):
    """Edit 'redis-slave.conf.template'
    """
    cluster_id = config.get_cur_cluster_id()
    path_of_fb = config.get_path_of_fb(cluster_id)
    target_path = path_of_fb['slave_template']
    self._edit_conf(target_path, syntax='sh')
    center = Center()
    center.update_ip_port()
    success = center.check_hosts_connection()
    if not success:
        return
    success = center.sync_file(target_path)
    if success:
        msg = message.get('complete_conf_edit')
        logger.info(msg)

def force_failover(self, server):
    """Find all masters on the server and convert them to slaves.
    Afterwards, only slaves will remain on the server.

    :param server: IP or hostname
    """
    logger.debug('force_failover')
    center = Center()
    center.update_ip_port()
    master_nodes = center.get_master_obj_list()
    cluster_id = config.get_cur_cluster_id()
    lib_path = config.get_ld_library_path(cluster_id)
    path_of_fb = config.get_path_of_fb(cluster_id)
    sr2_redis_bin = path_of_fb['sr2_redis_bin']
    env_cmd = [
        'GLOBIGNORE=*;',
        'export LD_LIBRARY_PATH={};'.format(lib_path['ld_library_path']),
        'export DYLD_LIBRARY_PATH={};'.format(
            lib_path['dyld_library_path']),
    ]
    redis_cli_cmd = os.path.join(sr2_redis_bin, 'redis-cli')
    for node in master_nodes:
        addr = node['addr']
        (host, port) = addr.split(':')
        if self.compare_ip(host, server):
            for slave_node in node['slaves']:
                addr = slave_node['addr']
                (s_host, s_port) = addr.split(':')
                sub_cmd = 'cluster failover takeover'
                command = '{} {} -h {} -p {} {}'.format(
                    ' '.join(env_cmd),
                    redis_cli_cmd,
                    s_host,
                    s_port,
                    sub_cmd,
                )
                self._print(
                    message.get('try_failover_takeover').format(slave=addr))
                stdout = subprocess.check_output(command, shell=True)
                self._print(stdout)

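# Sketch of the shell command force_failover() assembles, with
# placeholder paths and endpoint. CLUSTER FAILOVER TAKEOVER makes the
# addressed replica claim mastership without negotiating with its
# (possibly unreachable) master.
_env = ' '.join([
    'GLOBIGNORE=*;',
    'export LD_LIBRARY_PATH=/placeholder/lib;',   # placeholder path
    'export DYLD_LIBRARY_PATH=/placeholder/lib;', # placeholder path
])
_cmd = '{} {} -h {} -p {} {}'.format(
    _env, '/placeholder/bin/redis-cli', '10.0.0.2', 18150,
    'cluster failover takeover')
# subprocess.check_output(_cmd, shell=True) would run it as above.
print(_cmd)
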
def failback(self):
    """Restart disconnected redis
    """
    center = Center()
    center.update_ip_port()
    master_obj_list = center.get_master_obj_list()
    disconnected_list = []
    paused_list = []
    for master in master_obj_list:
        if master['status'] == 'disconnected':
            disconnected_list.append(master['addr'])
        if master['status'] == 'paused':
            paused_list.append(master['addr'])
        for slave in master['slaves']:
            if slave['status'] == 'disconnected':
                disconnected_list.append(slave['addr'])
            if slave['status'] == 'paused':
                paused_list.append(slave['addr'])
    classified_disconnected_list = {}
    classified_paused_list = {}
    for disconnected in disconnected_list:
        host, port = disconnected.split(':')
        if host not in classified_disconnected_list:
            classified_disconnected_list[host] = []
        classified_disconnected_list[host].append(port)
    for paused in paused_list:
        host, port = paused.split(':')
        if host not in classified_paused_list:
            classified_paused_list[host] = []
        classified_paused_list[host].append(port)
    current_time = time.strftime("%Y%m%d-%H%M", time.gmtime())
    for host, ports in classified_disconnected_list.items():
        msg = message.get('redis_run')
        msg = msg.format(host=host, ports='|'.join(ports))
        logger.info(msg)
        center.run_redis_process(host, ports, False, current_time)
    for host, ports in classified_paused_list.items():
        msg = message.get('redis_restart')
        msg = msg.format(host=host, ports='|'.join(ports))
        logger.info(msg)
        center.stop_redis_process(host, ports)
        center.run_redis_process(host, ports, False, current_time)
    if not classified_disconnected_list and not classified_paused_list:
        msg = message.get('already_all_redis_alive')
        logger.info(msg)

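# A minimal sketch of the host-grouping step in failback(), using
# collections.defaultdict instead of the explicit membership check.
# The addresses below are made-up examples.
from collections import defaultdict

def group_by_host(addr_list):
    """Group 'host:port' strings into {host: [port, ...]}."""
    grouped = defaultdict(list)
    for addr in addr_list:
        host, port = addr.split(':')
        grouped[host].append(port)
    return grouped

_g = group_by_host(['10.0.0.1:18100', '10.0.0.1:18101', '10.0.0.2:18100'])
assert _g['10.0.0.1'] == ['18100', '18101']
assert _g['10.0.0.2'] == ['18100']
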
def distribution(self):
    """Check the distribution of all masters and slaves
    """
    center = Center()
    center.update_ip_port()
    logger.debug('distribution')
    ret = RedisCliUtil.command_all_async('cluster nodes', slave=True)
    outs = ''
    for _, host, port, res, stdout in ret:
        if res == 'OK':
            outs = '\n'.join([outs, stdout])
        else:
            logger.warning("FAIL {}:{} {}".format(host, port, stdout))
    lines = outs.splitlines()
    myself_key = 'myself'
    # materialize so the filter can be reused for every host below
    filtered_lines = list(filter(lambda x: myself_key in x, lines))
    meta = []
    total_masters = 0
    total_slaves = 0
    for nd in center.master_host_list:
        num_of_masters = 0
        num_of_slaves = 0
        node_ip = socket.gethostbyname(nd)
        host_lines = filter(lambda x: (node_ip + ':') in x, filtered_lines)
        for line in host_lines:
            params = line.split()
            roles = params[2]
            role = roles.split(',')[1]
            if role == 'master':
                # a master that owns slots has 9 fields
                if len(params) == 9:
                    num_of_masters += 1
            else:
                num_of_slaves += 1
        total_masters += num_of_masters
        total_slaves += num_of_slaves
        hostname = '{}({})'.format(
            socket.gethostbyaddr(node_ip)[0], node_ip)
        meta.append([hostname, num_of_masters, num_of_slaves])
    meta.append(['TOTAL', total_masters, total_slaves])
    utils.print_table([['HOST', 'MASTER', 'SLAVE']] + meta)

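# Sketch of how a 'cluster nodes' line is classified in distribution().
# The sample lines are synthetic but follow the field layout the parser
# relies on: a master that owns slots has 9 whitespace-separated fields
# (the 9th being its slot range), while a slave line has 8.
def classify(line):
    params = line.split()
    role = params[2].split(',')[1]
    if role == 'master' and len(params) == 9:
        return 'master'
    if role == 'slave':
        return 'slave'
    return 'no-slot master'

_master = 'abc1 10.0.0.1:18100 myself,master - 0 0 1 connected 0-5461'
_slave = 'abc2 10.0.0.1:18150 myself,slave abc3 0 0 2 connected'
assert classify(_master) == 'master'
assert classify(_slave) == 'slave'
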
def delete(self, cluster_id):
    """Delete cluster

    The cluster is automatically backed up with a timestamp tag
    before deletion.

    :param cluster_id: target cluster id
    """
    if not cluster_util.validate_id(cluster_id):
        raise ClusterIdError(cluster_id)
    path_of_fb = config.get_path_of_fb(cluster_id)
    props_path = path_of_fb['redis_properties']
    hosts = config.get_props(props_path, 'sr2_redis_master_hosts', [])
    tag = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    cluster_backup_dir = 'cluster_{}_bak_{}'.format(cluster_id, tag)
    for host in hosts:
        Center().cluster_backup(host, cluster_id, cluster_backup_dir)
    msg = message.get('cluster_delete_complete')
    msg = msg.format(cluster_id=cluster_id)
    logger.info(msg)

def clean(self, logs=False):
    """Clean cluster

    Delete redis config, data, and node configuration.

    :param logs: Delete redis logs instead
    """
    if not isinstance(logs, bool):
        msg = message.get('error_option_type_not_boolean')
        msg = msg.format(option='logs')
        logger.error(msg)
        return
    center = Center()
    center.update_ip_port()
    if logs:
        center.remove_all_of_redis_log_force()
        return
    center.cluster_clean()
    msg = message.get('apply_after_restart')
    logger.info(msg)

def reset_distribution(self):
    """Reset the distribution of masters and slaves to the original setting
    """
    center = Center()
    center.update_ip_port()
    logger.debug('reset_distribution')
    cluster_id = config.get_cur_cluster_id()
    lib_path = config.get_ld_library_path(cluster_id)
    path_of_fb = config.get_path_of_fb(cluster_id)
    sr2_redis_bin = path_of_fb['sr2_redis_bin']
    env_cmd = [
        'GLOBIGNORE=*;',
        'export LD_LIBRARY_PATH={};'.format(lib_path['ld_library_path']),
        'export DYLD_LIBRARY_PATH={};'.format(
            lib_path['dyld_library_path']),
    ]
    redis_cli_cmd = os.path.join(sr2_redis_bin, 'redis-cli')
    slave_nodes = center.get_slave_nodes()
    master_ports = center.master_port_list
    for slave_node in slave_nodes:
        (host, port) = slave_node.split(':')
        try:
            value = int(port)
            if value in master_ports:
                # a slave listening on a master port was failed over;
                # take the master role back
                msg = message.get('try_failover_takeover').format(
                    slave=slave_node)
                self._print(msg)
                sub_cmd = 'cluster failover takeover'
                command = '{} {} -h {} -p {} {}'.format(
                    ' '.join(env_cmd),
                    redis_cli_cmd,
                    host,
                    port,
                    sub_cmd,
                )
                stdout = subprocess.check_output(command, shell=True)
                self._print(stdout)
        except ValueError:
            pass

def nodes_with_dir(self, server, dir):
    """Find nodes that use the specified directory path

    :param server: IP or hostname
    :param dir: directory path
    """
    center = Center()
    center.update_ip_port()
    logger.debug('nodes_with_dir')
    ret = RedisCliUtil.command_all_async('config get dir', slave=True)
    outs = ''
    meta = []
    for _, host, port, res, stdout in ret:
        if res == 'OK':
            # 'config get dir' replies with two lines ('dir' and the
            # path); joining onto the empty string leaves the path at
            # index 2 after splitlines()
            flat_stdout = '\n'.join([outs, stdout])
            line = flat_stdout.splitlines()
            if self.compare_ip(host, server) and dir in line[2]:
                meta.append([host, port, line[2]])
        else:
            logger.warning("FAIL {}:{} {}".format(host, port, stdout))
    utils.print_table([['HOST', 'PORT', 'PATH']] + meta)

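# Sketch of the reply-shape assumption in nodes_with_dir(): 'config get
# dir' returns the key name and then the value. The sample reply text
# and path below are made up.
_stdout = 'dir\n/sata_ssd/ssd_03/nvkvs'   # hypothetical reply
_flat = '\n'.join(['', _stdout])
_line = _flat.splitlines()                # ['', 'dir', '/sata_ssd/ssd_03/nvkvs']
assert _line[2] == '/sata_ssd/ssd_03/nvkvs'
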
def forget_noaddr(self):
    """Forget noaddr nodes that are no longer used in the cluster
    """
    center = Center()
    center.update_ip_port()
    logger.debug('forget_noaddr')
    ret = RedisCliUtil.command_all_async('cluster nodes', slave=True)
    outs = ''
    for _, host, port, res, stdout in ret:
        if res == 'OK':
            outs = '\n'.join([outs, stdout])
        else:
            logger.warning("FAIL {}:{} {}".format(host, port, stdout))
    lines = outs.splitlines()
    filtered_lines = filter(lambda x: 'noaddr' in x, lines)
    total_list = []
    for line in filtered_lines:
        total_list.append(line.split()[0])
    # remove duplicates
    unique_list = list(set(total_list))
    # forget noaddr uuid
    for uuid in unique_list:
        sub_cmd = 'cluster forget "{id}" 2>&1'.format(id=uuid)
        ret = RedisCliUtil.command_all_async(sub_cmd, slave=True)
        count = 0
        for _, host, port, res, stdout in ret:
            if res == 'OK':
                count += 1
            else:
                logger.warning("FAIL {}:{} {}".format(host, port, stdout))
        msg = '{num} nodes have forgotten {id}'.format(num=count, id=uuid)
        self._print(msg)

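# Sketch of the noaddr extraction in forget_noaddr(), with synthetic
# 'cluster nodes' output: the node id (first field) of every line
# flagged 'noaddr' is collected, and set() drops ids reported by more
# than one node.
_lines = [
    'id-1 :0 master,noaddr - 0 0 1 disconnected',
    'id-2 10.0.0.1:18100 myself,master - 0 0 2 connected 0-5461',
    'id-1 :0 master,noaddr - 0 0 1 disconnected',
]
_noaddr = [l.split()[0] for l in _lines if 'noaddr' in l]
assert sorted(set(_noaddr)) == ['id-1']
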
def _deploy(cluster_id, history_save, clean):
    deploy_state = DeployUtil().get_state(cluster_id)
    if deploy_state == DEPLOYED:
        msg = message.get('ask_deploy_again')
        msg = msg.format(cluster_id=cluster_id)
        msg = color.yellow(msg)
        yes = ask_util.askBool(msg, default='n')
        if not yes:
            logger.info(message.get('cancel'))
            return
    restore_yes = None
    no_localhost = False
    current_time = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    cluster_backup_dir = 'cluster_{}_bak_{}'.format(cluster_id, current_time)
    conf_backup_dir = 'cluster_{}_conf_bak_{}'.format(
        cluster_id, current_time)
    tmp_backup_dir = 'cluster_{}_conf_bak_{}'.format(cluster_id, 'tmp')
    meta = [['NAME', 'VALUE']]
    path_of_fb = config.get_path_of_fb(cluster_id)
    conf_path = path_of_fb['conf_path']
    props_path = path_of_fb['redis_properties']
    cluster_path = path_of_fb['cluster_path']
    path_of_cli = config.get_path_of_cli(cluster_id)
    conf_backup_path = path_of_cli['conf_backup_path']
    tmp_backup_path = os.path.join(conf_backup_path, tmp_backup_dir)
    local_ip = config.get_local_ip()

    # ask installer
    installer_path = ask_util.installer()
    installer_name = os.path.basename(installer_path)
    meta.append(['installer', installer_name])

    # ask restore conf
    if deploy_state == DEPLOYED:
        restore_yes = ask_util.askBool(message.get('ask_restore_conf'))
        meta.append(['restore', restore_yes])

    # input props
    hosts = []
    if deploy_state == DEPLOYED:
        if restore_yes:
            meta += DeployUtil().get_meta_from_props(props_path)
            hosts = config.get_props(props_path, 'sr2_redis_master_hosts')
        else:
            if not os.path.isdir(conf_backup_path):
                os.mkdir(conf_backup_path)
            if os.path.exists(tmp_backup_path):
                msg = message.get('ask_load_history_of_previous_modification')
                yes = ask_util.askBool(msg)
                if not yes:
                    shutil.rmtree(tmp_backup_path)
            if not os.path.exists(tmp_backup_path):
                os.mkdir(tmp_backup_path)
                shutil.copy(
                    os.path.join(conf_path, 'redis.properties'),
                    os.path.join(tmp_backup_path, 'redis.properties'))
            tmp_props_path = os.path.join(tmp_backup_path, 'redis.properties')
            editor.edit(tmp_props_path, syntax='sh')
            meta += DeployUtil().get_meta_from_props(tmp_props_path)
            hosts = config.get_props(tmp_props_path, 'sr2_redis_master_hosts')
    else:
        # new deploy
        props_dict = ask_util.props(cluster_id, save=history_save)
        hosts = props_dict['hosts']
        meta += DeployUtil().get_meta_from_dict(props_dict)
    utils.print_table(meta)
    msg = message.get('confirm_deploy_information')
    yes = ask_util.askBool(msg)
    if not yes:
        logger.info(message.get('cancel'))
        return

    # check node status
    success = Center().check_hosts_connection(hosts, True)
    if not success:
        msg = message.get('error_exist_unavailable_host')
        logger.error(msg)
        return
    logger.debug('Connection of all hosts ok.')
    success = Center().check_include_localhost(hosts)
    if not success:
        no_localhost = True

    # get port info
    if deploy_state == DEPLOYED:
        if restore_yes:
            key = 'sr2_redis_master_ports'
            m_ports = config.get_props(props_path, key, [])
            key = 'sr2_redis_slave_ports'
            s_ports = config.get_props(props_path, key, [])
            replicas = len(s_ports) // len(m_ports)
        else:
            key = 'sr2_redis_master_ports'
            m_ports = config.get_props(tmp_props_path, key, [])
            key = 'sr2_redis_slave_ports'
            s_ports = config.get_props(tmp_props_path, key, [])
            replicas = len(s_ports) // len(m_ports)
    else:
        m_ports = props_dict['master_ports']
        s_ports = props_dict['slave_ports']
        replicas = props_dict['replicas']
    while True:
        msg = message.get('check_port')
        logger.info(msg)
        host_ports_list = []
        for host in hosts:
            host_ports_list.append((host, m_ports + s_ports))
        conflict = Center().check_port_is_enable(host_ports_list)
        if not conflict:
            logger.info("OK")
            break
        utils.print_table([["HOST", "PORT"]] + conflict)
        msg = message.get('ask_port_collision')
        msg = color.yellow(msg)
        yes = ask_util.askBool(msg)
        if yes:
            logger.info("OK")
            break
        m_ports = ask_util.master_ports(False, cluster_id)
        replicas = ask_util.replicas(False)
        s_ports = ask_util.slave_ports(cluster_id, len(m_ports), replicas)
        if deploy_state == DEPLOYED:
            if restore_yes:
                key = 'sr2_redis_master_ports'
                value = cluster_util.convert_list_2_seq(m_ports)
                config.set_props(props_path, key, value)
                key = 'sr2_redis_slave_ports'
                value = cluster_util.convert_list_2_seq(s_ports)
                config.set_props(props_path, key, value)
            else:
                key = 'sr2_redis_master_ports'
                value = cluster_util.convert_list_2_seq(m_ports)
                config.set_props(tmp_props_path, key, value)
                key = 'sr2_redis_slave_ports'
                value = cluster_util.convert_list_2_seq(s_ports)
                config.set_props(tmp_props_path, key, value)
        else:
            props_dict['master_ports'] = m_ports
            props_dict['slave_ports'] = s_ports
            props_dict['replicas'] = replicas

    # if pending, delete legacy on each host
    if no_localhost:
        if DeployUtil().get_state(cluster_id, local_ip) == PENDING:
            client = net.get_ssh(local_ip)
            command = 'rm -rf {}'.format(cluster_path)
            net.ssh_execute(client=client, command=command)
            client.close()
    for host in hosts:
        if DeployUtil().get_state(cluster_id, host) == PENDING:
            client = net.get_ssh(host)
            command = 'rm -rf {}'.format(cluster_path)
            net.ssh_execute(client=client, command=command)
            client.close()

    # added_hosts = post_hosts - pre_hosts
    msg = message.get('check_cluster_exist')
    logger.info(msg)
    added_hosts = set(hosts)
    meta = []
    if deploy_state == DEPLOYED:
        pre_hosts = config.get_props(props_path, 'sr2_redis_master_hosts')
        added_hosts -= set(pre_hosts)
    can_deploy = True
    if no_localhost:
        added_hosts |= set([local_ip])
    for host in added_hosts:
        client = net.get_ssh(host)
        is_localhost = Center().is_localhost(host)
        if is_localhost:
            if no_localhost:
                continue
            if os.path.exists(cluster_path + '/remote'):
                meta.append([host, color.green('CLEAN')])
                continue
        if net.is_exist(client, cluster_path):
            meta.append([host, color.red('CLUSTER EXIST')])
            can_deploy = False
            continue
        meta.append([host, color.green('CLEAN')])
    if meta:
        utils.print_table([['HOST', 'STATUS']] + meta)
    if not can_deploy:
        msg = message.get('error_cluster_collision')
        logger.error(msg)
        return
    logger.info('OK')

    # cluster stop and clean
    if deploy_state == DEPLOYED and clean:
        center = Center()
        cur_cluster_id = config.get_cur_cluster_id(allow_empty_id=True)
        run_cluster_use(cluster_id)
        center.update_ip_port()
        center.stop_redis()
        center.remove_all_of_redis_log_force()
        center.cluster_clean()
        run_cluster_use(cur_cluster_id)

    # backup conf
    if deploy_state == DEPLOYED:
        Center().conf_backup(local_ip, cluster_id, conf_backup_dir)

    # backup cluster
    backup_hosts = []
    if deploy_state == DEPLOYED:
        backup_hosts += set(pre_hosts)
    for host in backup_hosts:
        cluster_path = path_of_fb['cluster_path']
        client = net.get_ssh(host)
        Center().cluster_backup(host, cluster_id, cluster_backup_dir)
        client.close()

    # transfer & install
    msg = message.get('transfer_and_execute_installer')
    logger.info(msg)
    target_hosts = hosts + [local_ip] if no_localhost else hosts
    for host in target_hosts:
        if not (no_localhost and Center().is_localhost(host)):
            logger.info(' - {}'.format(host))
        client = net.get_ssh(host)
        cmd = 'mkdir -p {0} && touch {0}/.deploy.state'.format(cluster_path)
        net.ssh_execute(client=client, command=cmd)
        client.close()
        DeployUtil().transfer_installer(host, cluster_id, installer_path)
        try:
            DeployUtil().install(host, cluster_id, installer_name)
        except SSHCommandError as ex:
            msg = message.get('error_execute_installer')
            msg = msg.format(installer=installer_path)
            logger.error(msg)
            logger.exception(ex)
            return

    # setup props
    if deploy_state == DEPLOYED:
        if restore_yes:
            tag = conf_backup_dir
        else:
            tag = tmp_backup_dir
        Center().conf_restore(local_ip, cluster_id, tag)
    else:
        key = 'sr2_redis_master_hosts'
        config.make_key_enable(props_path, key)
        config.set_props(props_path, key, props_dict['hosts'])

        key = 'sr2_redis_master_ports'
        config.make_key_enable(props_path, key)
        value = cluster_util.convert_list_2_seq(props_dict['master_ports'])
        config.set_props(props_path, key, value)

        key = 'sr2_redis_slave_hosts'
        config.make_key_enable(props_path, key)
        config.set_props(props_path, key, props_dict['hosts'])
        config.make_key_disable(props_path, key)

        if props_dict['replicas'] > 0:
            key = 'sr2_redis_slave_hosts'
            config.make_key_enable(props_path, key)
            key = 'sr2_redis_slave_ports'
            config.make_key_enable(props_path, key)
            value = cluster_util.convert_list_2_seq(props_dict['slave_ports'])
            config.set_props(props_path, key, value)

        key = 'ssd_count'
        config.make_key_enable(props_path, key)
        config.set_props(props_path, key, props_dict['ssd_count'])

        key = 'sr2_redis_data'
        config.make_key_enable(props_path, key, v1_flg=True)
        config.make_key_enable(props_path, key, v1_flg=True)
        config.make_key_disable(props_path, key)
        config.set_props(props_path, key, props_dict['prefix_of_db_path'])

        key = 'sr2_redis_db_path'
        config.make_key_enable(props_path, key, v1_flg=True)
        config.make_key_enable(props_path, key, v1_flg=True)
        config.make_key_disable(props_path, key)
        config.set_props(props_path, key, props_dict['prefix_of_db_path'])

        key = 'sr2_flash_db_path'
        config.make_key_enable(props_path, key, v1_flg=True)
        config.make_key_enable(props_path, key, v1_flg=True)
        config.make_key_disable(props_path, key)
        config.set_props(props_path, key, props_dict['prefix_of_db_path'])

    # sync props
    msg = message.get('sync_conf')
    logger.info(msg)
    for node in hosts:
        if socket.gethostbyname(node) in config.get_local_ip_list():
            continue
        client = net.get_ssh(node)
        if not client:
            msg = message.get('error_ssh_connection').format(host=node)
            logger.error(msg)
            return
        net.copy_dir_to_remote(client, conf_path, conf_path)
        client.close()

    # set deploy state complete
    if os.path.exists(tmp_backup_path):
        shutil.rmtree(tmp_backup_path)
    for node in target_hosts:
        path_of_fb = config.get_path_of_fb(cluster_id)
        cluster_path = path_of_fb['cluster_path']
        client = net.get_ssh(node)
        cmd = 'rm -rf {}'.format(os.path.join(cluster_path, '.deploy.state'))
        net.ssh_execute(client=client, command=cmd)
        client.close()
    if no_localhost:
        os.system('touch {}/remote'.format(cluster_path))

    msg = message.get('complete_deploy').format(cluster_id=cluster_id)
    logger.info(msg)
    Cluster().use(cluster_id)
    msg = message.get('suggest_after_deploy')
    logger.info(msg)

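# Sketch of the replica-count arithmetic used in _deploy(): when ports
# are restored from an existing redis.properties, the replica factor is
# recovered as floor(len(slave_ports) / len(master_ports)). The port
# lists below are made-up examples.
_m_ports = [18100, 18101, 18102]
_s_ports = [18150, 18151, 18152, 18153, 18154, 18155]
_replicas = len(_s_ports) // len(_m_ports)
assert _replicas == 2  # two slave ports per master port
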
def _deploy_zero_downtime(cluster_id):
    logger.debug("zero downtime update cluster {}".format(cluster_id))
    center = Center()
    center.update_ip_port()
    m_hosts = center.master_host_list
    m_ports = center.master_port_list
    s_hosts = center.slave_host_list
    s_ports = center.slave_port_list
    path_of_fb = config.get_path_of_fb(cluster_id)
    cluster_path = path_of_fb['cluster_path']

    # check master alive
    m_count = len(m_hosts) * len(m_ports)
    alive_m_count = center.get_alive_master_redis_count()
    if alive_m_count < m_count:
        logger.error(message.get('error_exist_disconnected_master'))
        return
    if not config.is_slave_enabled():
        logger.error(message.get('error_need_to_slave'))
        return

    # select installer
    installer_path = ask_util.installer()
    installer_name = os.path.basename(installer_path)

    # backup info
    current_time = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    conf_backup_dir = 'cluster_{}_conf_bak_{}'.format(
        cluster_id, current_time)
    cluster_backup_dir = 'cluster_{}_bak_{}'.format(cluster_id, current_time)
    local_ip = config.get_local_ip()

    # backup conf
    center.conf_backup(local_ip, cluster_id, conf_backup_dir)

    # backup cluster
    for host in s_hosts:
        client = net.get_ssh(host)
        center.cluster_backup(host, cluster_id, cluster_backup_dir)
        client.close()

    # transfer & install
    logger.info(message.get('transfer_and_execute_installer'))
    for host in m_hosts:
        logger.info(' - {}'.format(host))
        client = net.get_ssh(host)
        cmd = 'mkdir -p {0} && touch {0}/.deploy.state'.format(cluster_path)
        net.ssh_execute(client=client, command=cmd)
        client.close()
        DeployUtil().transfer_installer(host, cluster_id, installer_path)
        try:
            DeployUtil().install(host, cluster_id, installer_name)
        except SSHCommandError as ex:
            msg = message.get('error_execute_installer')
            msg = msg.format(installer=installer_path)
            logger.error(msg)
            logger.exception(ex)
            return

    # restore conf
    center.conf_restore(local_ip, cluster_id, conf_backup_dir)

    # set deploy state complete
    for node in m_hosts:
        path_of_fb = config.get_path_of_fb(cluster_id)
        cluster_path = path_of_fb['cluster_path']
        client = net.get_ssh(node)
        cmd = 'rm -rf {}'.format(os.path.join(cluster_path, '.deploy.state'))
        net.ssh_execute(client=client, command=cmd)
        client.close()

    # restart slaves
    center.stop_current_nodes(master=False, slave=True)
    center.configure_redis()
    center.sync_conf()
    center.start_current_nodes(master=False, slave=True)
    center.wait_until_all_redis_process_up()
    slaves_for_failover = center.get_slave_nodes()

    key = 'cluster-node-timeout'
    origin_m_value = center.cli_config_get(key, m_hosts[0], m_ports[0])
    origin_s_value = center.cli_config_get(key, s_hosts[0], s_ports[0])
    logger.debug('config set: cluster-node-timeout 2000')
    RedisCliConfig().set(key, '2000', all=True)

    # cluster failover (with no option)
    logger.info(message.get('failover_on_deploy'))
    logger.debug(slaves_for_failover)
    try_count = 0
    while try_count < 10:
        try_count += 1
        success = True
        for slave_addr in slaves_for_failover:
            host, port = slave_addr.split(':')
            stdout = center.run_failover("{}:{}".format(host, port))
            logger.debug("failover {}:{} {}".format(host, port, stdout))
            if stdout != "ERR You should send CLUSTER FAILOVER to a slave":
                # In some cases, the cluster failover is not complete
                # even if stdout is OK.
                # Once redis has completely changed to master, it
                # returns 'ERR You should send CLUSTER FAILOVER to a slave'
                success = False
        if success:
            break
        msg = message.get('retry').format(try_count=try_count)
        logger.info(msg)
        time.sleep(5)
    logger.debug('restore config: cluster-node-timeout')
    center.cli_config_set_all(key, origin_m_value, m_hosts, m_ports)
    center.cli_config_set_all(key, origin_s_value, s_hosts, s_ports)
    if not success:
        logger.error(message.get('error_redis_failover'))
        return

    # restart masters (currently slaves)
    center.stop_current_nodes(master=False, slave=True)
    center.configure_redis(slave=False)
    center.sync_conf()
    center.start_current_nodes(master=False, slave=True)
    center.wait_until_all_redis_process_up()

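# Minimal sketch of the bounded retry loop used for the failover step
# above: try up to max_tries rounds, treat a round as successful only if
# every item succeeds, and sleep between rounds. attempt_failover is a
# placeholder for the per-slave check built around center.run_failover.
import time

def retry_all(items, attempt_failover, max_tries=10, delay=5):
    for _ in range(max_tries):
        if all(attempt_failover(item) for item in items):
            return True
        time.sleep(delay)
    return False

# e.g. retry_all(['10.0.0.1:18150'], lambda addr: True) -> True
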
def restore(self, cluster_id, tag=None):
    """Restore cluster

    :param cluster_id: target cluster id
    :param tag: Tag of backup. If omitted, the most recent backup is used.
    """
    logger.debug('cluster restore: cluster_id={}, tag={}'.format(
        cluster_id, tag))
    if not cluster_util.validate_id(cluster_id):
        raise ClusterIdError(cluster_id)
    # find restore folder with tag (local)
    path_of_fb = config.get_path_of_fb(cluster_id)
    cluster_backup_path = path_of_fb['cluster_backup_path']
    if tag is None:
        backup_list = os.listdir(cluster_backup_path)
        pattern = 'cluster_{}_bak_'.format(cluster_id)
        filtered = filter(lambda x: x.startswith(pattern), backup_list)
        sorted_list = sorted(list(filtered))
        if not sorted_list:
            msg = message.get('error_not_found_any_backup')
            logger.error('BackupNotExistError: ' + msg)
            return
        tag = sorted_list[-1]
        logger.debug("tag option is empty, auto select: {}".format(tag))
    cluster_restore_dir = tag
    backup_path = os.path.join(cluster_backup_path, cluster_restore_dir)
    if not os.path.isdir(backup_path):
        msg = message.get('error_not_found_backup').format(tag=tag)
        logger.error('BackupNotExistError: ' + msg)
        return
    # get hosts from cluster props
    props_path = os.path.join(
        backup_path,
        'tsr2-assembly-1.0.0-SNAPSHOT',
        'conf',
        'redis.properties')
    hosts = config.get_props(props_path, 'sr2_redis_master_hosts', [])
    # check status of hosts
    success = Center().check_hosts_connection(hosts, True)
    if not success:
        msg = message.get('error_exist_unavailable_host')
        logger.error(msg)
        return
    logger.debug('Connection of all hosts ok.')
    success = Center().check_include_localhost(hosts)
    if not success:
        msg = message.get('error_not_include_localhost')
        logger.error(msg)
        return
    # check tag folder on all hosts: OK / NOT FOUND
    msg = message.get('check_backup_info')
    logger.info(msg)
    buf = []
    for host in hosts:
        client = net.get_ssh(host)
        if not net.is_dir(client, backup_path):
            logger.debug('cannot find backup dir: {}-{}'.format(
                host, cluster_restore_dir))
            buf.append([host, color.red('NOT FOUND')])
        client.close()
    if buf:
        utils.print_table([['HOST', 'RESULT']] + buf)
        return
    logger.info('OK')
    # backup cluster
    new_tag = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    cluster_backup_dir = 'cluster_{}_bak_{}'.format(cluster_id, new_tag)
    for host in hosts:
        Center().cluster_backup(host, cluster_id, cluster_backup_dir)
    # restore cluster
    command = "cp -r {} {}/cluster_{}".format(
        backup_path,
        path_of_fb['base_directory'],
        cluster_id)
    for host in hosts:
        msg = message.get('restore_cluster')
        msg = msg.format(tag=cluster_backup_dir, host=host)
        logger.info(msg)
        client = net.get_ssh(host)
        net.ssh_execute(client, command)
        client.close()
    logger.info("OK")

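# Sketch of the tag auto-selection in restore(): backup directories are
# named 'cluster_<id>_bak_<YYYYmmddHHMMSS>', so a lexicographic sort puts
# the most recent timestamp last. The directory names are examples.
_backups = [
    'cluster_1_bak_20230101000000',
    'cluster_1_bak_20231231235959',
    'cluster_1_bak_20230615120000',
]
_pattern = 'cluster_{}_bak_'.format(1)
_candidates = sorted(x for x in _backups if x.startswith(_pattern))
assert _candidates[-1] == 'cluster_1_bak_20231231235959'
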
def add_slave(self, yes=False):
    """Add slaves to a cluster that was configured with masters only.

    :param yes: Skip confirmation of information
    """
    logger.debug('add_slave')
    if not isinstance(yes, bool):
        msg = message.get('error_option_type_not_boolean')
        msg = msg.format(option='yes')
        logger.error(msg)
        return
    center = Center()
    center.update_ip_port()
    # check
    s_hosts = center.slave_host_list
    s_ports = center.slave_port_list
    if not s_hosts:
        msg = message.get('error_slave_host_empty')
        raise ClusterRedisError(msg)
    if not s_ports:
        msg = message.get('error_slave_port_empty')
        raise ClusterRedisError(msg)
    success = center.check_hosts_connection(hosts=s_hosts)
    if not success:
        return
    center.ensure_cluster_exist()
    slave_alive_count = center.get_alive_slave_redis_count()
    slave_alive_count_mine = center.get_alive_slave_redis_count(
        check_owner=True)
    not_mine_count = slave_alive_count - slave_alive_count_mine
    if not_mine_count > 0:
        msg = message.get('error_cluster_start_slave_collision')
        msg = '\n'.join(msg).format(count=not_mine_count)
        raise LightningDBError(12, msg)
    # confirm info
    result = center.confirm_node_port_info(skip=yes)
    if not result:
        msg = message.get('cancel')
        logger.warning(msg)
        return
    # clean
    center.cluster_clean(master=False)
    # backup logs
    center.backup_server_logs(master=False)
    center.create_redis_data_directory(master=False)
    # configure
    center.configure_redis(master=False)
    center.sync_conf()
    # start
    center.start_redis_process(master=False)
    center.wait_until_all_redis_process_up()
    # change redis config temporarily
    key = 'cluster-node-timeout'
    origin_s_value = center.cli_config_get(key, s_hosts[0], s_ports[0])
    if not origin_s_value:
        msg = "RedisConfigKeyError: '{}'".format(key)
        logger.warning(msg)
    if origin_s_value:
        # cli config set cluster-node-timeout 2000
        logger.debug('set cluster node timeout to 2000 for replicate')
        center.cli_config_set_all(key, '2000', s_hosts, s_ports)
    # replicate
    center.replicate()
    if origin_s_value:
        # restore cluster-node-timeout
        logger.debug('restore cluster node timeout')
        center.cli_config_set_all(key, origin_s_value, s_hosts, s_ports)

def create(self, yes=False):
    """Create cluster

    Before creating a cluster, all redis processes should be running.

    :param yes: Skip confirmation of information
    """
    center = Center()
    center.update_ip_port()
    success = center.check_hosts_connection()
    if not success:
        return
    m_count = len(center.master_host_list) * len(center.master_port_list)
    if m_count < 3:
        msg = message.get('error_master_redis_less_than_3')
        raise ClusterRedisError(msg)
    # cluster start, if needed
    alive_count = center.get_alive_all_redis_count()
    my_alive_count = center.get_alive_all_redis_count(check_owner=True)
    if alive_count != my_alive_count:
        msg = message.get('error_cluster_start_port_collision')
        raise ClusterRedisError(msg)
    all_count = len(center.all_host_list)
    if alive_count < all_count:
        logger.debug('cluster start in create')
        # init
        center.backup_server_logs()
        center.create_redis_data_directory()
        # cluster configure
        center.configure_redis()
        center.sync_conf(show_result=True)
        # cluster start
        center.start_redis_process()
        center.wait_until_all_redis_process_up()
    key = 'cluster-node-timeout'
    m_hosts = center.master_host_list
    m_ports = center.master_port_list
    origin_m_value = center.cli_config_get(key, m_hosts[0], m_ports[0])
    if not origin_m_value:
        msg = "RedisConfigKeyError(master): '{}'".format(key)
        logger.warning(msg)
    s_hosts = center.slave_host_list
    s_ports = center.slave_port_list
    if s_hosts and s_ports:
        origin_s_value = center.cli_config_get(key, s_hosts[0], s_ports[0])
        if not origin_s_value:
            msg = "RedisConfigKeyError(slave): '{}'".format(key)
            logger.warning(msg)
    if origin_m_value:
        # cli config set cluster-node-timeout 2000
        logger.debug('set cluster node timeout to 2000 for create')
        center.cli_config_set_all(key, '2000', m_hosts, m_ports)
        if s_hosts and s_ports and origin_s_value:
            center.cli_config_set_all(key, '2000', s_hosts, s_ports)
    center.create_cluster(yes)
    if origin_m_value:
        # restore cluster-node-timeout
        logger.debug('restore cluster node timeout')
        center.cli_config_set_all(key, origin_m_value, m_hosts, m_ports)
        if s_hosts and s_ports and origin_s_value:
            v = origin_s_value
            center.cli_config_set_all(key, v, s_hosts, s_ports)

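# Sketch of the temporary-config pattern used in create() and
# add_slave(): lower cluster-node-timeout while the topology command
# runs, then restore the captured value. cli_config_get and
# cli_config_set_all stand in for the Center methods of the same names;
# the try/finally guard is an addition here, not in the original code.
def with_temp_config(center, key, temp_value, hosts, ports, action):
    origin = center.cli_config_get(key, hosts[0], ports[0])
    if origin:
        center.cli_config_set_all(key, temp_value, hosts, ports)
    try:
        action()
    finally:
        if origin:
            center.cli_config_set_all(key, origin, hosts, ports)
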
def failover_list(self):
    """Find failovered|no-slave|no-slot masters and failbacked slaves
    """
    center = Center()
    center.update_ip_port()
    logger.debug('failover_list')
    master_nodes = center.get_master_obj_list()
    slave_nodes = center.get_slave_nodes()
    master_ports = center.master_port_list
    slave_ports = center.slave_port_list
    output_msg = []
    failovered_masters = []
    for master_node in master_nodes:
        addr = master_node['addr']
        port = addr.split(':')[1]
        try:
            value = int(port)
            if value in slave_ports:
                failovered_masters.append(addr)
        except ValueError:
            pass
    noslave_masters = []
    for master_node in master_nodes:
        if len(master_node['slaves']) == 0:
            noslave_masters.append(master_node['addr'])
        else:
            for slave_node in master_node['slaves']:
                if slave_node['status'] == 'disconnected':
                    noslave_masters.append(master_node['addr'])
                    break
    noslot_masters = []
    ret = RedisCliUtil.command_all_async('cluster nodes', slave=True)
    outs = ''
    for _, host, port, res, stdout in ret:
        if res == 'OK':
            outs = '\n'.join([outs, stdout])
        else:
            logger.warning("FAIL {}:{} {}".format(host, port, stdout))
    lines = outs.splitlines()
    filtered_nodes = filter(lambda x: 'myself,master' in x, lines)
    for line in filtered_nodes:
        words = line.split()
        # a master line without a slot range has 8 fields instead of 9
        if len(words) == 8:
            noslot_masters.append(words[1])
    failbacked_slaves = []
    for slave_node in slave_nodes:
        port = slave_node.split(':')[1]
        try:
            value = int(port)
            if value in master_ports:
                failbacked_slaves.append(slave_node)
        except ValueError:
            pass
    output_msg.append('1) failovered masters:')
    output_msg.extend(failovered_masters)
    output_msg.append('')
    output_msg.append('2) no-slave masters:')
    output_msg.extend(noslave_masters)
    output_msg.append('')
    output_msg.append('3) no-slot masters:')
    output_msg.extend(noslot_masters)
    output_msg.append('')
    output_msg.append('4) failbacked slaves:')
    output_msg.extend(failbacked_slaves)
    output_msg.append('')
    logger.info(color.ENDC + '\n'.join(output_msg))

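# Sketch of the port-based classification in failover_list(): a node is
# 'failovered' when a process on a slave port reports itself as master,
# and 'failbacked' when a process on a master port reports itself as
# slave. The port lists are made-up examples.
def classify_addr(addr, role, master_ports, slave_ports):
    port = int(addr.split(':')[1])
    if role == 'master' and port in slave_ports:
        return 'failovered master'
    if role == 'slave' and port in master_ports:
        return 'failbacked slave'
    return 'normal'

assert classify_addr('10.0.0.1:18150', 'master',
                     [18100], [18150]) == 'failovered master'
assert classify_addr('10.0.0.1:18100', 'slave',
                     [18100], [18150]) == 'failbacked slave'
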