def reset_distribution(self):
    """
    Reset the distribution of masters and slaves with original setting

    Every node currently reported as a slave whose port belongs to the
    configured master port list is asked to take its master role back by
    running ``cluster failover takeover`` through ``redis-cli``. The
    command output is printed with ``self._print``.

    Slave addresses whose port is not an integer are skipped.

    Raises:
        subprocess.CalledProcessError: if ``redis-cli`` exits non-zero.
    """
    center = Center()
    center.update_ip_port()
    logger.debug('reset_distribution')
    cluster_id = config.get_cur_cluster_id()
    lib_path = config.get_ld_library_path(cluster_id)
    path_of_fb = config.get_path_of_fb(cluster_id)
    sr2_redis_bin = path_of_fb['sr2_redis_bin']
    # The command is run through a shell because it relies on shell
    # constructs (variable assignment and `export`) to set up the
    # library search path before invoking redis-cli.
    env_cmd = [
        'GLOBIGNORE=*;',
        'export LD_LIBRARY_PATH={};'.format(lib_path['ld_library_path']),
        'export DYLD_LIBRARY_PATH={};'.format(
            lib_path['dyld_library_path']),
    ]
    redis_cli_cmd = os.path.join(sr2_redis_bin, 'redis-cli')
    slave_nodes = center.get_slave_nodes()
    master_ports = center.master_port_list
    for slave_node in slave_nodes:
        (host, port) = slave_node.split(':')
        # Only the port parsing is best-effort; a failure of the takeover
        # command itself must not be silently swallowed.
        try:
            value = int(port)
        except ValueError:
            continue
        if value not in master_ports:
            continue
        # failover takeover
        msg = message.get('try_failover_takeover').format(
            slave=slave_node)
        self._print(msg)
        sub_cmd = 'cluster failover takeover'
        command = '{} {} -h {} -p {} {}'.format(
            ' '.join(env_cmd),
            redis_cli_cmd,
            host,
            port,
            sub_cmd,
        )
        stdout = subprocess.check_output(command, shell=True)
        # check_output returns bytes on Python 3; joining bytes with a
        # str raises TypeError, so decode first. On Python 2 the value is
        # already a str and is left untouched.
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf-8', 'replace')
        outs = '\n'.join(['', stdout])
        self._print(outs)
def _deploy_zero_downtime(cluster_id):
    """
    Install a new package on a running cluster and restart it in two phases.

    Steps, in order:
      1. Abort if fewer masters are alive than ``hosts * ports`` or the
         slave check fails.
      2. Ask the user for an installer, back up the conf directory and the
         cluster directory on every slave host.
      3. Transfer and run the installer on every master host, using a
         ``.deploy.state`` marker file while the deploy is in progress.
      4. Restore the conf, remove the marker, restart the slaves.
      5. Temporarily set ``cluster-node-timeout`` to 2000, run
         ``cluster failover`` on every slave (up to 10 rounds, 5 seconds
         apart), then restore the original timeout.
      6. Restart the nodes that are now slaves (the former masters).

    Errors are reported through ``logger`` and the function returns
    ``None`` early; nothing is returned on success either.

    Args:
        cluster_id: identifier of the cluster to update.
    """
    logger.debug("zero downtime update cluster {}".format(cluster_id))
    center = Center()
    center.update_ip_port()
    m_hosts = center.master_host_list
    m_ports = center.master_port_list
    s_hosts = center.slave_host_list
    s_ports = center.slave_port_list
    path_of_fb = config.get_path_of_fb(cluster_id)
    cluster_path = path_of_fb['cluster_path']

    # check master alive
    m_count = len(m_hosts) * len(m_ports)
    alive_m_count = center.get_alive_master_redis_count()
    if alive_m_count < m_count:
        logger.error(message.get('error_exist_disconnected_master'))
        return
    # NOTE(review): this tests the attribute itself, not a call result. If
    # `config.is_slave_enabled` is a plain function this branch can never
    # trigger -- confirm against the config module.
    if not config.is_slave_enabled:
        logger.error(message.get('error_need_to_slave'))
        return

    # select installer
    installer_path = ask_util.installer()
    installer_name = os.path.basename(installer_path)

    # backup info
    current_time = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    conf_backup_dir = 'cluster_{}_conf_bak_{}'.format(cluster_id, current_time)
    cluster_backup_dir = 'cluster_{}_bak_{}'.format(cluster_id, current_time)
    local_ip = config.get_local_ip()

    # backup conf
    center.conf_backup(local_ip, cluster_id, conf_backup_dir)

    # backup cluster
    for host in s_hosts:
        client = net.get_ssh(host)
        # Close the connection even if the backup fails.
        try:
            center.cluster_backup(host, cluster_id, cluster_backup_dir)
        finally:
            client.close()

    # transfer & install
    logger.info(message.get('transfer_and_execute_installer'))
    for host in m_hosts:
        logger.info(' - {}'.format(host))
        client = net.get_ssh(host)
        cmd = 'mkdir -p {0} && touch {0}/.deploy.state'.format(cluster_path)
        try:
            net.ssh_execute(client=client, command=cmd)
        finally:
            client.close()
        DeployUtil().transfer_installer(host, cluster_id, installer_path)
        try:
            DeployUtil().install(host, cluster_id, installer_name)
        except SSHCommandError as ex:
            msg = message.get('error_execute_installer')
            msg = msg.format(installer=installer_path)
            logger.error(msg)
            logger.exception(ex)
            return

    # restore conf
    center.conf_restore(local_ip, cluster_id, conf_backup_dir)

    # set deploy state complete
    # The marker path does not depend on the node, so build the command once.
    path_of_fb = config.get_path_of_fb(cluster_id)
    cluster_path = path_of_fb['cluster_path']
    cmd = 'rm -rf {}'.format(os.path.join(cluster_path, '.deploy.state'))
    for node in m_hosts:
        client = net.get_ssh(node)
        try:
            net.ssh_execute(client=client, command=cmd)
        finally:
            client.close()

    # restart slave
    center.stop_current_nodes(master=False, slave=True)
    center.configure_redis()
    center.sync_conf()
    center.start_current_nodes(master=False, slave=True)

    center.wait_until_all_redis_process_up()
    slaves_for_failover = center.get_slave_nodes()

    key = 'cluster-node-timeout'
    origin_m_value = center.cli_config_get(key, m_hosts[0], m_ports[0])
    origin_s_value = center.cli_config_get(key, s_hosts[0], s_ports[0])
    success = False
    # The lowered timeout is temporary: restore the original values even
    # when a failover call raises, so the cluster is not left at 2000.
    try:
        logger.debug('config set: cluster-node-timeout 2000')
        RedisCliConfig().set(key, '2000', all=True)

        # cluster failover (with no option)
        logger.info(message.get('failover_on_deploy'))
        logger.debug(slaves_for_failover)
        try_count = 0
        while try_count < 10:
            try_count += 1
            success = True
            for slave_addr in slaves_for_failover:
                host, port = slave_addr.split(':')
                stdout = center.run_failover("{}:{}".format(host, port))
                logger.debug("failover {}:{} {}".format(host, port, stdout))
                if stdout != "ERR You should send CLUSTER FAILOVER to a slave":
                    # In some cases, the cluster failover is not complete
                    # even if stdout is OK
                    # If redis changed to master completely,
                    # return 'ERR You should send CLUSTER FAILOVER to a slave'
                    success = False
            if success:
                break
            msg = message.get('retry').format(try_count=try_count)
            logger.info(msg)
            time.sleep(5)
    finally:
        logger.debug('restore config: cluster-node-timeout')
        center.cli_config_set_all(key, origin_m_value, m_hosts, m_ports)
        center.cli_config_set_all(key, origin_s_value, s_hosts, s_ports)
    if not success:
        logger.error(message.get('error_redis_failover'))
        return

    # restart master (current slave)
    # After the failover the former masters hold the slave role, which is
    # why the slave selector is used here as well.
    center.stop_current_nodes(master=False, slave=True)
    center.configure_redis(slave=False)
    center.sync_conf()
    center.start_current_nodes(master=False, slave=True)
    center.wait_until_all_redis_process_up()
def failover_list(self):
    """
    Find failovered|no-slave|no-slot masters and failbacked slaves

    Logs a four-section report:
      1) failovered masters: masters whose port is in the slave port list
      2) no-slave masters: masters with no slave, or with at least one
         slave whose status is 'disconnected'
      3) no-slot masters: nodes that describe themselves as
         'myself,master' in `cluster nodes` with exactly 8 fields, i.e.
         no slot column after the fixed fields
      4) failbacked slaves: slaves whose port is in the master port list

    Addresses whose port is not an integer are skipped. Nodes whose
    `cluster nodes` call did not return 'OK' are logged as warnings and
    left out of section 3.
    """
    center = Center()
    center.update_ip_port()
    logger.debug('failover_list')
    master_nodes = center.get_master_obj_list()
    slave_nodes = center.get_slave_nodes()
    master_ports = center.master_port_list
    slave_ports = center.slave_port_list

    failovered_masters = []
    for master_node in master_nodes:
        addr = master_node['addr']
        port = addr.split(':')[1]
        try:
            value = int(port)
        except ValueError:
            continue
        if value in slave_ports:
            failovered_masters.append(addr)

    noslave_masters = []
    for master_node in master_nodes:
        slaves = master_node['slaves']
        if len(slaves) == 0 or any(
                slave['status'] == 'disconnected' for slave in slaves):
            noslave_masters.append(master_node['addr'])

    # Collect the successful outputs first and parse them afterwards, so
    # the report still works when no node answered 'OK' (previously the
    # filtered list was only defined inside the 'OK' branch).
    ret = RedisCliUtil.command_all_async('cluster nodes', slave=True)
    ok_outputs = []
    for _, host, port, res, stdout in ret:
        if res == 'OK':
            ok_outputs.append(stdout)
        else:
            logger.warning("FAIL {}:{} {}".format(host, port, stdout))
    noslot_masters = []
    for line in '\n'.join(ok_outputs).splitlines():
        if 'myself,master' not in line:
            continue
        words = line.split()
        if len(words) == 8:
            # The second field is the node address.
            noslot_masters.append(words[1])

    failbacked_slaves = []
    for slave_node in slave_nodes:
        port = slave_node.split(':')[1]
        try:
            value = int(port)
        except ValueError:
            continue
        if value in master_ports:
            failbacked_slaves.append(slave_node)

    sections = [
        ('1) failovered masters:', failovered_masters),
        ('2) no-slave masters:', noslave_masters),
        ('3) no-slot masters:', noslot_masters),
        ('4) failbacked slaves:', failbacked_slaves),
    ]
    output_msg = []
    for title, entries in sections:
        output_msg.append(title)
        output_msg.extend(entries)
        output_msg.append('')
    logger.info(color.ENDC + '\n'.join(output_msg))