def test_kill_journal_node():
    """Kill the 'journalnode' process for journal-0 and check task turnover.

    Task IDs are captured before the kill. After ``config.expect_recovery``
    the journal tasks are expected to carry new IDs, while the name and data
    tasks are expected to keep the IDs they had.
    """
    service = FOLDERED_SERVICE_NAME

    # Snapshot the task IDs that the checks below compare against.
    journal_before = sdk_tasks.get_task_ids(service, 'journal-0')
    name_before = sdk_tasks.get_task_ids(service, 'name')
    data_before = sdk_tasks.get_task_ids(service, 'data')

    journal_host = sdk_hosts.system_host(service, 'journal-0-node')
    sdk_tasks.kill_task_with_pattern('journalnode', journal_host)
    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'journal', journal_before)
    sdk_tasks.check_tasks_not_updated(service, 'name', name_before)
    sdk_tasks.check_tasks_not_updated(service, 'data', data_before)
def test_kill_data_node():
    """Kill the 'datanode' process for data-0 and check task turnover.

    Only the data tasks are expected to be relaunched; journal and name task
    IDs are expected to be the same before and after recovery.
    """
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    # Snapshot the task IDs that the checks below compare against.
    data_before = sdk_tasks.get_task_ids(service, 'data-0')
    journal_before = sdk_tasks.get_task_ids(service, 'journal')
    name_before = sdk_tasks.get_task_ids(service, 'name')

    data_host = sdk_hosts.system_host(service, 'data-0-node')
    sdk_cmd.kill_task_with_pattern('datanode', data_host)
    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'data', data_before)
    sdk_tasks.check_tasks_not_updated(service, 'journal', journal_before)
    sdk_tasks.check_tasks_not_updated(service, 'name', name_before)
def test_kill_data_node():
    """Kill the 'datanode' process on the agent running data-0.

    The first task returned for "data-0" supplies both the agent host to
    target and the task ID that is expected to change after recovery.
    Journal and name task IDs are expected to stay the same.
    """
    victim = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_before = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern(
        "datanode",
        "nobody",
        agent_host=victim.host,
    )
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [victim.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_before)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_before)
def test_kill_all_journalnodes():
    """Restart every journal pod and check task turnover.

    Each journal pod is restarted through the service CLI, followed by a
    ``config.expect_recovery`` call. Journal task IDs are expected to change
    and data task IDs are expected to be unchanged.
    """
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_before = sdk_tasks.get_task_ids(foldered_name, "data")

    for pod in config.get_pod_type_instances("journal", foldered_name):
        restart_cmd = "pod restart {}".format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, restart_cmd)
        config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_before)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_before)
def test_kill_data_node():
    """Kill the 'datanode' process for data-0 and check task turnover.

    The data tasks are expected to get new IDs after recovery, while the
    journal and name tasks are expected to keep theirs.
    """
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    old_data = sdk_tasks.get_task_ids(svc, 'data-0')
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_name = sdk_tasks.get_task_ids(svc, 'name')

    target_host = sdk_hosts.system_host(svc, 'data-0-node')
    sdk_cmd.kill_task_with_pattern('datanode', target_host)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'data', old_data)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'name', old_name)
def test_kill_all_journalnodes(hdfs_server):
    """Restart every journal pod of the service described by ``hdfs_server``.

    Args:
        hdfs_server: mapping from which ``["service"]["name"]`` is read to
            obtain the service name.

    Journal task IDs are expected to change; name and data task IDs are
    expected to be unchanged.
    """
    svc = hdfs_server["service"]["name"]

    # Snapshot task IDs per pod type, in the order journal, name, data.
    ids_before = {
        pod_type: sdk_tasks.get_task_ids(svc, pod_type)
        for pod_type in ("journal", "name", "data")
    }

    for pod in config.get_pod_type_instances("journal", svc):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, "pod restart {}".format(pod))
        config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, "journal", ids_before["journal"])
    sdk_tasks.check_tasks_not_updated(svc, "name", ids_before["name"])
    sdk_tasks.check_tasks_not_updated(svc, "data", ids_before["data"])
def test_kill_all_datanodes():
    """Restart every data pod and check task turnover.

    Data task IDs are expected to change; journal and name task IDs are
    expected to be unchanged.
    """
    # Snapshot task IDs per pod type, in the order journal, name, data.
    ids_before = {
        pod_type: sdk_tasks.get_task_ids(foldered_name, pod_type)
        for pod_type in ("journal", "name", "data")
    }

    for pod in config.get_pod_type_instances("data", foldered_name):
        restart_cmd = "pod restart {}".format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, restart_cmd)
        config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", ids_before["data"])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", ids_before["journal"])
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", ids_before["name"])
def test_kill_all_datanodes():
    """Restart every data pod via ``sdk_cmd.run_cli`` and check task turnover.

    Data task IDs are expected to change; journal and name task IDs are
    expected to be unchanged.
    """
    svc = config.FOLDERED_SERVICE_NAME

    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_name = sdk_tasks.get_task_ids(svc, 'name')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    for pod in config.get_pod_type_instances("data", svc):
        restart_cmd = 'hdfs --name={} pod restart {}'.format(svc, pod)
        sdk_cmd.run_cli(restart_cmd)
        config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'data', old_data)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'name', old_name)
def test_kill_all_journalnodes():
    """Restart every journal pod and check task turnover.

    Each journal pod is restarted through the service CLI, followed by a
    ``config.expect_recovery`` call. Journal task IDs are expected to change
    and data task IDs are expected to be unchanged.
    """
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    # Reuse the already-computed foldered name instead of deriving it again.
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME,
            foldered_name,
            'pod restart {}'.format(journal_pod),
        )
        config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_kill_all_datanodes():
    """Restart every data pod and check task turnover.

    Data task IDs are expected to change; journal and name task IDs are
    expected to be unchanged.
    """
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_name = sdk_tasks.get_task_ids(svc, 'name')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    for pod in config.get_pod_type_instances("data", svc):
        restart_cmd = 'pod restart {}'.format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, restart_cmd)
        config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'data', old_data)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'name', old_name)
def test_kill_all_journalnodes():
    """Restart every journal pod and check task turnover.

    Journal task IDs are expected to change and data task IDs are expected to
    be unchanged.
    """
    svc = FOLDERED_SERVICE_NAME

    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    for pod in config.get_pod_type_instances("journal", svc):
        restart_cmd = 'pod restart {}'.format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, restart_cmd)
        config.expect_recovery(service_name=svc)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def replace_name_node(index):
    """Replace the name pod ``name-<index>`` and check task turnover.

    Args:
        index: value appended (via ``str``) to ``'name-'`` to form the pod name.

    The replaced pod's task IDs are expected to change; journal and data task
    IDs are expected to be unchanged.
    """
    svc = config.FOLDERED_SERVICE_NAME
    config.check_healthy(service_name=svc)

    pod_name = 'name-' + str(index)
    old_name = sdk_tasks.get_task_ids(svc, pod_name)
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    replace_cmd = 'hdfs --name={} pod replace {}'.format(svc, pod_name)
    sdk_cmd.run_cli(replace_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, pod_name, old_name)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def replace_name_node(index):
    """Replace the name pod ``name-<index>`` and check task turnover.

    Args:
        index: value appended (via ``str``) to ``'name-'`` to form the pod name.

    The replaced pod's task IDs are expected to change; journal and data task
    IDs are expected to be unchanged.
    """
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=svc)

    pod_name = 'name-' + str(index)
    old_name = sdk_tasks.get_task_ids(svc, pod_name)
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    replace_cmd = 'pod replace {}'.format(pod_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, replace_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, pod_name, old_name)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def test_kill_all_datanodes():
    """Restart every data pod and check task turnover.

    Data task IDs are expected to change; journal and name task IDs are
    expected to be unchanged.
    """
    svc = FOLDERED_SERVICE_NAME

    # Snapshot task IDs per pod type, in the order journal, name, data.
    ids_before = {
        pod_type: sdk_tasks.get_task_ids(svc, pod_type)
        for pod_type in ('journal', 'name', 'data')
    }

    for pod in config.get_pod_type_instances("data", svc):
        restart_cmd = 'pod restart {}'.format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, restart_cmd)
        config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'data', ids_before['data'])
    sdk_tasks.check_tasks_not_updated(svc, 'journal', ids_before['journal'])
    sdk_tasks.check_tasks_not_updated(svc, 'name', ids_before['name'])
def test_permanent_and_transient_namenode_failures_1_0():
    """Replace name-1, restart name-0, then check task turnover.

    Both name pods are expected to end up with new task IDs; journal and data
    task IDs are expected to be unchanged.
    """
    svc = FOLDERED_SERVICE_NAME
    config.check_healthy(service_name=svc)

    old_name_0 = sdk_tasks.get_task_ids(svc, 'name-0')
    old_name_1 = sdk_tasks.get_task_ids(svc, 'name-1')
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    # Issue both pod commands back to back before waiting for recovery.
    for pod_cmd in ('pod replace name-1', 'pod restart name-0'):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, pod_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'name-0', old_name_0)
    sdk_tasks.check_tasks_updated(svc, 'name-1', old_name_1)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def test_permanent_and_transient_namenode_failures_1_0():
    """Replace name-1, restart name-0, then check task turnover.

    Both name pods are expected to end up with new task IDs; journal and data
    task IDs are expected to be unchanged.
    """
    config.check_healthy(service_name=foldered_name)

    # Snapshot task IDs in the order name-0, name-1, journal, data.
    ids_before = {
        prefix: sdk_tasks.get_task_ids(foldered_name, prefix)
        for prefix in ("name-0", "name-1", "journal", "data")
    }

    # Issue both pod commands back to back before waiting for recovery.
    for pod_cmd in ("pod replace name-1", "pod restart name-0"):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, pod_cmd)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "name-0", ids_before["name-0"])
    sdk_tasks.check_tasks_updated(foldered_name, "name-1", ids_before["name-1"])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", ids_before["journal"])
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", ids_before["data"])
def replace_name_node(index):
    """Replace the name pod ``name-<index>`` and check task turnover.

    Args:
        index: value appended (via ``str``) to ``'name-'`` to form the pod name.

    The replaced pod's task IDs are expected to change; journal and data task
    IDs are expected to be unchanged.
    """
    svc = FOLDERED_SERVICE_NAME
    config.check_healthy(service_name=svc)

    pod_name = 'name-' + str(index)
    old_name = sdk_tasks.get_task_ids(svc, pod_name)
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    replace_cmd = 'pod replace {}'.format(pod_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, replace_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, pod_name, old_name)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def test_permanent_and_transient_namenode_failures_1_0():
    """Replace name-1, restart name-0 (via ``run_cli``), then check turnover.

    Both name pods are expected to end up with new task IDs; journal and data
    task IDs are expected to be unchanged.
    """
    svc = config.FOLDERED_SERVICE_NAME
    config.check_healthy(service_name=svc)

    old_name_0 = sdk_tasks.get_task_ids(svc, 'name-0')
    old_name_1 = sdk_tasks.get_task_ids(svc, 'name-1')
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    # Issue both pod commands back to back before waiting for recovery.
    replace_cmd = 'hdfs --name={} pod replace name-1'.format(svc)
    sdk_cmd.run_cli(replace_cmd)
    restart_cmd = 'hdfs --name={} pod restart name-0'.format(svc)
    sdk_cmd.run_cli(restart_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'name-0', old_name_0)
    sdk_tasks.check_tasks_updated(svc, 'name-1', old_name_1)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def test_permanent_and_transient_namenode_failures_1_0():
    """Replace name-1, restart name-0, then check task turnover.

    Both name pods are expected to end up with new task IDs; journal and data
    task IDs are expected to be unchanged.
    """
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=svc)

    old_name_0 = sdk_tasks.get_task_ids(svc, 'name-0')
    old_name_1 = sdk_tasks.get_task_ids(svc, 'name-1')
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    # Issue both pod commands back to back before waiting for recovery.
    for pod_cmd in ('pod replace name-1', 'pod restart name-0'):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, pod_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'name-0', old_name_0)
    sdk_tasks.check_tasks_updated(svc, 'name-1', old_name_1)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def test_permanent_and_transient_namenode_failures_0_1():
    """Replace name-0, restart name-1, then check task turnover.

    Both name pods are expected to end up with new task IDs; journal and data
    task IDs are expected to be unchanged.
    """
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=svc)

    old_name_0 = sdk_tasks.get_task_ids(svc, 'name-0')
    old_name_1 = sdk_tasks.get_task_ids(svc, 'name-1')
    old_journal = sdk_tasks.get_task_ids(svc, 'journal')
    old_data = sdk_tasks.get_task_ids(svc, 'data')

    # Issue both pod commands back to back before waiting for recovery.
    for pod_cmd in ('pod replace name-0', 'pod restart name-1'):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, pod_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, 'name-0', old_name_0)
    sdk_tasks.check_tasks_updated(svc, 'name-1', old_name_1)
    sdk_tasks.check_tasks_not_updated(svc, 'journal', old_journal)
    sdk_tasks.check_tasks_not_updated(svc, 'data', old_data)
def replace_node(index, type):
    """Replace the pod ``<type>-<index>`` and check task turnover.

    Args:
        index: pod index, formatted into the pod name.
        type: pod type, formatted into the pod name and excluded from the
            "not updated" checks.

    The replaced pod's task IDs are expected to change. Every other entry of
    ``config.HDFS_POD_TYPES`` is expected to keep its task IDs.
    """
    log.info("Starting to replace {}-{}".format(type, index))

    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=svc)

    pod_name = "{node_type}-{node_index}".format(node_type=type, node_index=index)
    old_pod_ids = sdk_tasks.get_task_ids(svc, pod_name)

    # Snapshot the task IDs of every pod type other than the one being replaced.
    untouched_ids = {
        other_type: sdk_tasks.get_task_ids(svc, other_type)
        for other_type in config.HDFS_POD_TYPES
        if other_type != type
    }

    replace_cmd = 'pod replace {}'.format(pod_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, replace_cmd)
    config.expect_recovery(service_name=svc)

    sdk_tasks.check_tasks_updated(svc, pod_name, old_pod_ids)

    for other_type in config.HDFS_POD_TYPES:
        if other_type == type:
            continue
        sdk_tasks.check_tasks_not_updated(svc, other_type, untouched_ids[other_type])