def test_custom_decommission():
    """Runs the decommission plan twice and verifies the custom step both times."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    def _set_world_count(count):
        # Scale the world pod count via the scheduler's marathon env.
        app = sdk_marathon.get_config(foldered_name)
        app['env']['WORLD_COUNT'] = count
        sdk_marathon.update_app(foldered_name, app)

    def _check_custom_step():
        # The first step of the first phase must be the custom decommission step.
        plan = sdk_plan.get_decommission_plan(foldered_name)
        log.info("decommission plan: {}".format(plan))
        assert "custom_decomission_step" == plan['phases'][0]['steps'][0]['name']

    # Scaling down from 2 to 1 world pods triggers a decommission.
    _set_world_count('1')
    sdk_plan.wait_for_completed_plan(foldered_name, 'decommission')
    _check_custom_step()

    # scale back up
    _set_world_count('2')
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # Let's decommission again!
    _set_world_count('1')
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_plan(foldered_name, 'decommission')
    _check_custom_step()
def test_modify_app_config_rollback():
    """Bumps a config value in the scheduler's marathon env, then rolls it back.

    Only journal nodes should be restarted by the change (and by the rollback);
    data nodes must remain untouched throughout, and the rolled-back value must
    match the original.
    """
    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    # Fix: foldered_name was used below but never defined, causing a NameError
    # as soon as the test ran. Derive it the same way as the sibling tests do.
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    # Snapshot the pristine config so we can roll back to it later.
    old_config = sdk_marathon.get_config(foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    log.info("expiry ms: " + str(expiry_ms))
    # Bump the value by one to force a config change.
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    # The rollback must restore the original expiry value.
    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config["env"][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_modify_app_config_rollback():
    """Modify a scheduler env value, verify journal nodes restart, then roll back."""
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    # Snapshot the pristine config so we can roll back to it later.
    old_config = marathon.get_config(PACKAGE_NAME)

    app = marathon.get_config(PACKAGE_NAME)
    sdk_utils.out('marathon config: ')
    sdk_utils.out(app)
    expiry_ms = int(app['env'][app_config_field])
    sdk_utils.out('expiry ms: ' + str(expiry_ms))

    # Bump the value by one and push the update.
    app['env'][app_config_field] = str(expiry_ms + 1)
    marathon.update_app(PACKAGE_NAME, app)

    # Wait for journal nodes to be affected by the change
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')

    sdk_utils.out('old config: ')
    sdk_utils.out(old_config)
    # Put the old config back (rollback)
    marathon.update_app(PACKAGE_NAME, old_config)

    # Wait for the journal nodes to return to their old configuration
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()

    # The rollback must restore the original expiry value.
    app = marathon.get_config(PACKAGE_NAME)
    assert int(app['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_modify_app_config_rollback():
    """Change one scheduler env var, verify journal restart and rollback; data untouched."""
    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    # Keep a copy of the unmodified app definition for the rollback step.
    old_config = sdk_marathon.get_config(foldered_name)

    app = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(app)
    expiry_ms = int(app["env"][app_config_field])
    log.info("expiry ms: " + str(expiry_ms))
    app["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, app, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    # Rollback must restore the original value.
    app = sdk_marathon.get_config(foldered_name)
    assert int(app["env"][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_modify_app_config_rollback():
    """Bump the shortcircuit cache expiry in the scheduler env, then roll it back.

    Verifies that journal nodes restart on both the change and the rollback,
    that the rolled-back value matches the original, and that data nodes are
    never restarted.
    """
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')
    # Snapshot of the pristine config, used for the rollback below.
    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))
    # Bump the value by 1 to force a config change.
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)
    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)
    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    # The rollback must restore the original expiry value.
    assert int(marathon_config['env'][app_config_field]) == expiry_ms
    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_custom_decommission():
    """Decommission twice (scale 2->1, back to 2, then 2->1) checking the custom step each time."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    def _scale_world(count):
        # Adjust the world pod count through the scheduler's marathon env.
        app = sdk_marathon.get_config(foldered_name)
        app["env"]["WORLD_COUNT"] = count
        sdk_marathon.update_app(app)

    def _verify_custom_step():
        # The very first step of the plan must be the custom decommission step.
        plan = sdk_plan.get_decommission_plan(foldered_name)
        log.info(sdk_plan.plan_string("decommission", plan))
        assert "custom_decommission_step" == plan["phases"][0]["steps"][0]["name"]

    _scale_world("1")
    sdk_plan.wait_for_completed_plan(foldered_name, "decommission")
    _verify_custom_step()

    # scale back up
    _scale_world("2")
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # Let's decommission again!
    _scale_world("1")
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_plan(foldered_name, "decommission")
    _verify_custom_step()
def test_plugin_install_and_uninstall(default_populated_index):
    """Install the phonetic analysis plugin via the env var, then remove it again."""
    plugin_name = 'analysis-phonetic'

    # Setting the env var triggers plugin installation on the nodes.
    app = marathon.get_config(PACKAGE_NAME)
    app['env']['ELASTICSEARCH_PLUGINS'] = plugin_name
    marathon.update_app(PACKAGE_NAME, app)
    check_plugin_installed(plugin_name)

    # Clearing it triggers removal.
    app = marathon.get_config(PACKAGE_NAME)
    app['env']['ELASTICSEARCH_PLUGINS'] = ""
    marathon.update_app(PACKAGE_NAME, app)
    check_plugin_uninstalled(plugin_name)
def test_plugin_install_and_uninstall(default_populated_index):
    """Round-trip a plugin: install via the env var, then uninstall by clearing it."""
    plugin_name = 'analysis-phonetic'

    # Install: name the plugin in the scheduler env and push the update.
    app = marathon.get_config(PACKAGE_NAME)
    app['env']['ELASTICSEARCH_PLUGINS'] = plugin_name
    marathon_update(app)
    check_plugin_installed(plugin_name)

    # Uninstall: clear the env var and push again.
    app = marathon.get_config(PACKAGE_NAME)
    app['env']['ELASTICSEARCH_PLUGINS'] = ""
    marathon_update(app)
    check_plugin_uninstalled(plugin_name)
def test_plugin_install_and_uninstall(default_populated_index):
    """Install and then uninstall a plugin on the foldered service."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    plugin_name = 'analysis-phonetic'

    def _set_plugins(value):
        # Plugins are managed through a TASKCFG_ALL env var on the scheduler.
        app = sdk_marathon.get_config(foldered_name)
        app['env']['TASKCFG_ALL_ELASTICSEARCH_PLUGINS'] = value
        sdk_marathon.update_app(foldered_name, app)

    _set_plugins(plugin_name)
    config.check_plugin_installed(plugin_name, service_name=foldered_name)

    _set_plugins("")
    config.check_plugin_uninstalled(plugin_name, service_name=foldered_name)
def test_plugin_install_and_uninstall(default_populated_index):
    """Verify a plugin can be installed and uninstalled via the scheduler env."""
    plugin_name = 'analysis-phonetic'

    def _apply_plugins(value):
        # Push the desired plugin list through the scheduler's marathon env.
        app = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
        app['env']['TASKCFG_ALL_ELASTICSEARCH_PLUGINS'] = value
        sdk_marathon.update_app(FOLDERED_SERVICE_NAME, app)

    _apply_plugins(plugin_name)
    config.check_plugin_installed(plugin_name, service_name=FOLDERED_SERVICE_NAME)

    _apply_plugins("")
    config.check_plugin_uninstalled(plugin_name, service_name=FOLDERED_SERVICE_NAME)
def test_plugin_install_and_uninstall(default_populated_index):
    """Install the 'analysis-phonetic' plugin via the scheduler env, then remove it.

    Setting TASKCFG_ALL_ELASTICSEARCH_PLUGINS rolls the plugin out to the
    nodes; clearing it uninstalls the plugin again.
    """
    plugin_name = 'analysis-phonetic'
    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    marathon_config['env']['TASKCFG_ALL_ELASTICSEARCH_PLUGINS'] = plugin_name
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)
    config.check_plugin_installed(
        plugin_name, service_name=FOLDERED_SERVICE_NAME)
    # Clearing the env var triggers plugin removal.
    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    marathon_config['env']['TASKCFG_ALL_ELASTICSEARCH_PLUGINS'] = ""
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)
    config.check_plugin_uninstalled(
        plugin_name, service_name=FOLDERED_SERVICE_NAME)
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar.

    Flow: confirm cache refresh works, disable the cache via
    DISABLE_STATE_CACHE and confirm refresh now fails with 409, then
    re-enable and confirm refresh works again. The env-only changes must
    never restart the service's tasks.
    '''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    # '' as the task prefix matches every task in the service.
    task_ids = sdk_tasks.get_task_ids(foldered_name, '')
    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    assert "Received cmd: refresh" in stdout
    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(foldered_name, marathon_config)
    # A scheduler-env-only change must not restart the service's tasks.
    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda res: not res)
    def check_cache_refresh_fails_409conflict():
        output = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache', return_stderr_in_stdout=True)
        return "failed: 409 Conflict" in output
    check_cache_refresh_fails_409conflict()

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(foldered_name, marathon_config)
    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)
    shakedown.deployment_wait(
    )  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda res: not res)
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar.

    Confirms refresh works with caching on, fails with 409 while
    DISABLE_STATE_CACHE is set, and works again once the envvar is removed.
    No service tasks may be restarted by these scheduler-env-only changes.
    '''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    # '' as the task prefix matches every task in the service.
    task_ids = sdk_tasks.get_task_ids(foldered_name, '')
    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    assert "Received cmd: refresh" in stdout
    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(foldered_name, marathon_config)
    # An env-only change must not restart the service's tasks.
    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=120*1000,
        retry_on_result=lambda res: not res)
    def check_cache_refresh_fails_409conflict():
        output = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache', return_stderr_in_stdout=True)
        return "failed: 409 Conflict" in output
    check_cache_refresh_fails_409conflict()

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(foldered_name, marathon_config)
    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=120*1000,
        retry_on_result=lambda res: not res)
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
def test_state_refresh_disable_cache():
    """Disables caching via a scheduler envvar.

    With DISABLE_STATE_CACHE set, refresh_cache must fail with 409 Conflict;
    once removed, it must succeed again. The env-only changes must never
    restart the service's tasks.
    """
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    # "" as the task prefix matches every task in the service.
    task_ids = sdk_tasks.get_task_ids(foldered_name, "")
    # caching enabled by default:
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")
    assert rc == 0, "Refresh cache failed"
    assert "Received cmd: refresh" in stdout
    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config["env"]["DISABLE_STATE_CACHE"] = "any-text-here"
    sdk_marathon.update_app(marathon_config)
    # A scheduler-env-only change must not restart the service's tasks.
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda res: not res)
    def check_cache_refresh_fails_409conflict():
        rc, stdout, stderr = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")
        return rc != 0 and stdout == "" and "failed: 409 Conflict" in stderr
    check_cache_refresh_fails_409conflict()

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config["env"]["DISABLE_STATE_CACHE"]
    sdk_marathon.update_app(marathon_config)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)
    config.check_running(foldered_name)

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda res: not res)
    def check_cache_refresh():
        rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")
        assert rc == 0, "Refresh cache failed"
        return stdout
    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar.

    Confirms refresh works with caching on, fails with 409 while
    DISABLE_STATE_CACHE is set, and works again once the envvar is removed.
    '''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    # '' as the task prefix matches every task in the service.
    task_ids = sdk_tasks.get_task_ids(foldered_name, '')
    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'state refresh_cache')
    assert "Received cmd: refresh" in stdout
    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(foldered_name, marathon_config)
    # An env-only change must not restart the service's tasks.
    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        # The CLI raises on a non-success response; treat a 409 as the
        # expected outcome and anything else as "keep waiting".
        try:
            sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False
    shakedown.wait_for(lambda: check_cache_refresh_fails_409conflict(), timeout_seconds=120.)

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(foldered_name, marathon_config)
    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)
    shakedown.deployment_wait(
    )  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'state refresh_cache')
    stdout = shakedown.wait_for(lambda: check_cache_refresh(), timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
def test_deploy():
    """Verify hello-0-server fails without required env vars, then add them and deploy."""
    wait_time_in_seconds = 600
    sdk_plan.wait_for_kicked_off_deployment(config.SERVICE_NAME)

    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # sdk_marathon.json.mustache. verify that the tasks are failing before continuing.
    task_name = 'hello-0-server'
    log.info('Checking that {} is failing to launch within {}s'.format(task_name, wait_time_in_seconds))

    original_state_history = _get_state_history(task_name)

    # wait for new TASK_FAILEDs to appear:
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=1000 * wait_time_in_seconds,
        retry_on_result=lambda res: not res)
    def wait_for_new_failures():
        history_now = _get_state_history(task_name)
        assert len(history_now) >= len(original_state_history)
        fresh_states = history_now[len(original_state_history):]
        log.info("Added {} state history: {}".format(task_name, ", ".join(fresh_states)))
        return "TASK_FAILED" in fresh_states

    wait_for_new_failures()

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    env = marathon_config["env"]
    del env["SLEEP_DURATION"]
    env["TASKCFG_ALL_OUTPUT_FILENAME"] = "output"
    env["TASKCFG_ALL_SLEEP_DURATION"] = "1000"
    sdk_marathon.update_app(marathon_config)
    config.check_running()
def test_port_static_to_dynamic_port():
    """Switch brokers from static port 9092 to dynamic ports and verify endpoints."""
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
    pod_prefix = '{}-'.format(config.DEFAULT_POD_TYPE)
    broker_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_prefix)

    # BROKER_PORT=0 requests dynamically-allocated ports.
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    app['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(config.SERVICE_NAME, app)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_prefix, broker_ids)

    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    # No broker may still be on the old static port.
    for broker_id in range(config.DEFAULT_BROKER_COUNT):
        broker_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'broker get {}'.format(broker_id), json=True)
        assert broker_info['port'] != 9092

    endpoints = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints broker', json=True)
    assert len(endpoints['address']) == config.DEFAULT_BROKER_COUNT
    assert len(endpoints['dns']) == config.DEFAULT_BROKER_COUNT
    for entry in endpoints['address']:
        assert int(entry.split(':')[-1]) != 9092
    for entry in endpoints['dns']:
        assert int(entry.split(':')[-1]) != 9092
def test_node_replace_replaces_node():
    """Replace node-2-server on a different host, restoring placement afterwards."""
    matching = [task for task in sdk_tasks.get_summary() if task.name == "node-2-server"]
    replace_task = matching[0]
    log.info("avoid host for task {}".format(replace_task))
    replace_pod_name = replace_task.name[:-len("-server")]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config["env"]["PLACEMENT_CONSTRAINT"]
    try:
        avoid_host = '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = avoid_host
        sdk_marathon.update_app(marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = original_constraint
        sdk_marathon.update_app(marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def setup_constraint_switch():
    """Install hello pinned to one agent, then re-pin it to a different agent.

    Returns (some_agent, other_agent, hello_ids) for the caller's assertions.
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    agents = shakedown.get_private_agents()
    some_agent, other_agent = agents[0], agents[1]
    log.info('Agents: %s %s', some_agent, other_agent)
    assert some_agent != other_agent

    options = _escape_placement_for_1_9({
        "service": {"yaml": "marathon_constraint"},
        "hello": {
            "count": 1,
            # First, we stick the pod to some_agent
            "placement": '[["hostname", "LIKE", "{}"]]'.format(some_agent),
        },
        "world": {"count": 0},
    })
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, 1, additional_options=options)
    sdk_tasks.check_running(config.SERVICE_NAME, 1)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello')

    # Now, stick it to other_agent
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    app['env']['HELLO_PLACEMENT'] = '[["hostname", "LIKE", "{}"]]'.format(other_agent)
    sdk_marathon.update_app(config.SERVICE_NAME, app)
    # Wait for the scheduler to be up and settled before advancing.
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    return some_agent, other_agent, hello_ids
def test_custom_zookeeper(): broker_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE)) # sanity check: brokers should be reinitialized: brokers = service_cli('broker list', service_name=FOLDERED_SERVICE_NAME) assert set(brokers) == set([str(i) for i in range(DEFAULT_BROKER_COUNT)]) # create a topic against the default zk: service_cli('topic create {}'.format(DEFAULT_TOPIC_NAME), service_name=FOLDERED_SERVICE_NAME) assert service_cli('topic list', service_name=FOLDERED_SERVICE_NAME) == [ DEFAULT_TOPIC_NAME ] config = marathon.get_config(FOLDERED_SERVICE_NAME) # should be using default path when this envvar is empty/unset: assert config['env']['KAFKA_ZOOKEEPER_URI'] == '' # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall: zk_path = 'master.mesos:2181/dcos-service-test__integration__kafka/CUSTOMPATH' config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path marathon.update_app(FOLDERED_SERVICE_NAME, config) tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE), broker_ids) plan.wait_for_completed_deployment(FOLDERED_SERVICE_NAME) zookeeper = service_cli('endpoints zookeeper', get_json=False, service_name=FOLDERED_SERVICE_NAME) assert zookeeper.rstrip('\n') == zk_path # topic created earlier against default zk should no longer be present: assert service_cli('topic list', service_name=FOLDERED_SERVICE_NAME) == []
def test_deploy():
    """Confirm hello-0-server fails without required env vars, then add them and deploy."""
    wait_time = 30
    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # sdk_marathon.json.mustache. verify that the tasks are failing before continuing.
    task_name = 'hello-0-server'
    log.info('Checking that {} is failing to launch within {}s'.format(task_name, wait_time))

    original_statuses = sdk_tasks.get_status_history(task_name)

    # wait for new TASK_FAILEDs to appear:
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=1000*wait_time,
        retry_on_result=lambda res: not res)
    def wait_for_new_failures():
        current = sdk_tasks.get_status_history(task_name)
        assert len(current) >= len(original_statuses)
        fresh = current[len(original_statuses):]
        log.info('New {} statuses: {}'.format(task_name, ', '.join(fresh)))
        return 'TASK_FAILED' in fresh

    wait_for_new_failures()

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    env = app['env']
    del env['SLEEP_DURATION']
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    sdk_marathon.update_app(config.SERVICE_NAME, app)
    config.check_running()
def test_custom_zookeeper():
    """Re-point Kafka at a custom ZK path; verify registration and topic reset."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    pod_prefix = '{}-'.format(config.DEFAULT_POD_TYPE)
    broker_ids = sdk_tasks.get_task_ids(foldered_name, pod_prefix)

    # create a topic against the default zk:
    test_utils.create_topic(config.DEFAULT_TOPIC_NAME, service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    # should be using default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(sdk_utils.get_zk_path(foldered_name))
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(foldered_name, marathon_config)

    # Brokers restart to pick up the new ZK URI.
    sdk_tasks.check_tasks_updated(foldered_name, pod_prefix, broker_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # wait for brokers to finish registering
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # topic created earlier against default zk should no longer be present:
    topic_list_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'topic list', json=True)
    test_utils.assert_topic_lists_are_equal_without_automatic_topics([], topic_list_info)
def test_custom_zookeeper(): broker_ids = sdk_tasks.get_task_ids( FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE)) # create a topic against the default zk: sdk_cmd.svc_cli( config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'topic create {}'.format(config.DEFAULT_TOPIC_NAME), json=True) assert sdk_cmd.svc_cli( config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'topic list', json=True) == [config.DEFAULT_TOPIC_NAME] marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME) # should be using default path when this envvar is empty/unset: assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == '' # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall: zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(ZK_SERVICE_PATH) marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config) sdk_tasks.check_tasks_updated( FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids) sdk_plan.wait_for_completed_deployment(FOLDERED_SERVICE_NAME) # wait for brokers to finish registering test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=FOLDERED_SERVICE_NAME) zookeeper = sdk_cmd.svc_cli( config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'endpoints zookeeper') assert zookeeper.rstrip('\n') == zk_path # topic created earlier against default zk should no longer be present: assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'topic list', json=True) == []
def test_deploy(): wait_time = 30 # taskcfg.yml will initially fail to deploy because several options are missing in the default # marathon.json.mustache. verify that tasks are failing for 30s before continuing. print('Checking that tasks are failing to launch for at least {}s'.format(wait_time)) # we can get brief blips of TASK_RUNNING but they shouldnt last more than 2-3s: consecutive_task_running = 0 def fn(): nonlocal consecutive_task_running svc_tasks = shakedown.get_service_tasks(PACKAGE_NAME) states = [t['state'] for t in svc_tasks] print('Task states: {}'.format(states)) if 'TASK_RUNNING' in states: consecutive_task_running += 1 assert consecutive_task_running <= 3 else: consecutive_task_running = 0 return False try: spin.time_wait_noisy(lambda: fn(), timeout_seconds=wait_time) except shakedown.TimeoutExpired: print('Timeout reached as expected') # add the needed envvars in marathon and confirm that the deployment succeeds: config = marathon.get_config(PACKAGE_NAME) env = config['env'] del env['SLEEP_DURATION'] env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output' env['TASKCFG_ALL_SLEEP_DURATION'] = '1000' marathon.update_app(PACKAGE_NAME, config) check_running()
def test_no_change():
    """Re-submitting an unchanged marathon config must not restart any brokers.

    The deploy plan must be identical before and after the no-op update, and
    every phase/step must remain COMPLETE.
    """
    broker_ids = tasks.get_task_ids(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE))
    plan1 = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))

    # Push the config back unmodified -- a no-op update.
    config = marathon.get_config(SERVICE_NAME)
    marathon.update_app(SERVICE_NAME, config)
    plan2 = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))
    assert plan1 == plan2

    try:
        tasks.check_tasks_updated(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE), broker_ids, timeout_seconds=60)
        assert False, "Should not restart tasks now"
    except AssertionError as arg:
        raise arg
    except Exception:
        # Expected: check_tasks_updated times out because nothing restarted.
        # (Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed.)
        pass

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    # The plan must still be fully complete.
    assert plan2['status'] == 'COMPLETE'
    assert plan2['phases'][0]['status'] == 'COMPLETE'
    for step in range(DEFAULT_BROKER_COUNT):
        assert plan2['phases'][0]['steps'][step]['status'] == 'COMPLETE'
def setup_constraint_switch():
    """Install hello on one agent, then update placement to target another agent.

    Returns (some_agent, other_agent, hello_ids) for the caller's assertions.
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    agents = sdk_agents.get_private_agents()
    some_agent = agents[0]["hostname"]
    other_agent = agents[1]["hostname"]
    log.info("Agents: %s %s", some_agent, other_agent)
    assert some_agent != other_agent

    # First, we stick the pod to some_agent
    initial_placement = '[["hostname", "LIKE", "{}"]]'.format(some_agent)
    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {"count": 1, "placement": initial_placement},
            "world": {"count": 0},
        }
    )
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, 1, additional_options=options)
    sdk_tasks.check_running(config.SERVICE_NAME, 1)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")

    # Now, stick it to other_agent
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    app["env"]["HELLO_PLACEMENT"] = '[["hostname", "LIKE", "{}"]]'.format(other_agent)
    sdk_marathon.update_app(app)

    # Wait for the scheduler to be up and settled before advancing.
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    return some_agent, other_agent, hello_ids
def test_modify_app_config():
    """This tests checks that the modification of the app config does not trigger a recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    app = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(app)
    expiry_ms = int(app['env'][app_config_field])
    app['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, app, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)

    # A plain config update must not have scheduled any recovery work.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_port_static_to_dynamic_port():
    """Switch brokers from the static 9092 port to a dynamic one and verify."""
    sdk_tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
    pod_prefix = '{}-'.format(DEFAULT_POD_TYPE)
    old_broker_ids = sdk_tasks.get_task_ids(SERVICE_NAME, pod_prefix)

    # BROKER_PORT=0 requests a dynamically-assigned port.
    app_config = sdk_marathon.get_config(SERVICE_NAME)
    app_config['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(SERVICE_NAME, app_config)
    sdk_tasks.check_tasks_updated(SERVICE_NAME, pod_prefix, old_broker_ids)

    # all tasks are running
    sdk_tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    # No broker should report the old static port any more.
    for broker_id in range(DEFAULT_BROKER_COUNT):
        broker_info = service_cli('broker get {}'.format(broker_id))
        assert broker_info['port'] != 9092

    endpoints = service_cli('endpoints broker')
    assert len(endpoints['address']) == DEFAULT_BROKER_COUNT
    assert len(endpoints['dns']) == DEFAULT_BROKER_COUNT
    for entry in endpoints['address']:
        assert int(entry.split(':')[-1]) != 9092
    for entry in endpoints['dns']:
        assert int(entry.split(':')[-1]) != 9092
def test_node_replace_replaces_node():
    """Replace the node-2 pod, forcing the replacement off its current host."""
    matching = [t for t in sdk_tasks.get_summary() if t.name == 'node-2-server']
    replace_task = matching[0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = \
            '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def test_add_pods_post_update():
    """Grow both pod types after the role update and confirm new pods use the legacy role."""
    # Scale each pod type up by one; the new pods should launch with the new role.
    app = sdk_marathon.get_config(SERVICE_NAME)
    app["env"]["HELLO_COUNT"] = "2"
    app["env"]["WORLD_COUNT"] = "3"
    sdk_marathon.update_app(app)

    # Wait for scheduler to restart.
    sdk_plan.wait_for_completed_deployment(SERVICE_NAME)

    # Get the current service state to verify roles have applied.
    service_roles = sdk_utils.get_service_roles(SERVICE_NAME)
    task_roles = service_roles["task-roles"]

    # We must have some role!
    assert len(task_roles) > 0
    assert len(task_roles) == 5
    assert LEGACY_ROLE in task_roles.values()
    assert ENFORCED_ROLE not in task_roles.values()
    assert service_roles["framework-roles"] is None
    assert service_roles["framework-role"] == LEGACY_ROLE
def test_port_static_to_static_port():
    """Move brokers from static port 9092 to static port 9095."""
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
    pod_prefix = '{}-'.format(DEFAULT_POD_TYPE)
    old_broker_ids = tasks.get_task_ids(SERVICE_NAME, pod_prefix)

    app_config = marathon.get_config(SERVICE_NAME)
    utils.out('Old Config :{}'.format(app_config))

    # Every broker should currently be on the default static port.
    for broker_id in range(DEFAULT_BROKER_COUNT):
        broker_info = service_cli('broker get {}'.format(broker_id))
        assert broker_info['port'] == 9092

    app_config['env']['BROKER_PORT'] = '9095'
    marathon.update_app(SERVICE_NAME, app_config)
    utils.out('New Config :{}'.format(app_config))
    tasks.check_tasks_updated(SERVICE_NAME, pod_prefix, old_broker_ids)

    # all tasks are running
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    # ...and every broker should now report the new static port.
    for broker_id in range(DEFAULT_BROKER_COUNT):
        broker_info = service_cli('broker get {}'.format(broker_id))
        assert broker_info['port'] == 9095
def test_modify_app_config():
    """Confirm that an app-config change rolls all tasks without triggering a recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    # Record the recovery plan before the change so we can compare afterwards.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    recovery_before = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    journal_before = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_before = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_before = sdk_tasks.get_task_ids(foldered_name, 'data')

    app = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(app)
    expiry_ms = int(app['env'][app_config_field])
    app['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, app, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_before)
    sdk_tasks.check_tasks_updated(foldered_name, 'name', name_before)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_before)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    recovery_after = sdk_plan.get_plan(foldered_name, "recovery")
    assert recovery_before == recovery_after
def test_modify_app_config():
    """Check that a cache-expiry env change rolls tasks without triggering a recovery."""
    # NOTE(review): foldered_name is expected to be defined at module scope.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    recovery_before = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_before = sdk_tasks.get_task_ids(foldered_name, "name")
    data_before = sdk_tasks.get_task_ids(foldered_name, "data")

    app = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(app)
    expiry_ms = int(app["env"][app_config_field])
    app["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(app, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_before)
    sdk_tasks.check_tasks_updated(foldered_name, "name", name_before)
    sdk_tasks.check_tasks_updated(foldered_name, "data", data_before)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    recovery_after = sdk_plan.get_plan(foldered_name, "recovery")
    assert recovery_before == recovery_after
def test_modify_app_config():
    """Bump the cache-expiry setting and verify that ALL task types restart.

    Fixes a bug where the final 'data' check was passed ``journal_ids`` instead
    of ``data_ids``, so data-node restarts were never actually verified.
    """
    check_healthy()
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    # Snapshot current task IDs per type so restarts can be detected below.
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    expiry_ms = int(config['env'][app_config_field])
    config['env'][app_config_field] = str(expiry_ms + 1)
    # Push the updated app definition straight to Marathon.
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    # All tasks should be updated because hdfs-site.xml has changed
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    # BUG FIX: previously compared against journal_ids, which never verified data nodes.
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)

    check_healthy()
def test_custom_zookeeper(kafka_client: client.KafkaClient):
    """Point Kafka at a custom ZK path and verify brokers re-register there."""
    pod_prefix = "{}-".format(config.DEFAULT_POD_TYPE)
    broker_ids = sdk_tasks.get_task_ids(FOLDERED_NAME, pod_prefix)

    # create a topic against the default zk:
    kafka_client.create_topic(config.DEFAULT_TOPIC_NAME)

    app = sdk_marathon.get_config(FOLDERED_NAME)
    # should be using default path when this envvar is empty/unset:
    assert app["env"]["KAFKA_ZOOKEEPER_URI"] == ""

    # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's
    # automatically cleaned up in uninstall:
    zk_path = "master.mesos:2181/{}/CUSTOMPATH".format(sdk_utils.get_zk_path(FOLDERED_NAME))
    app["env"]["KAFKA_ZOOKEEPER_URI"] = zk_path
    sdk_marathon.update_app(app)

    sdk_tasks.check_tasks_updated(FOLDERED_NAME, pod_prefix, broker_ids)
    sdk_plan.wait_for_completed_deployment(FOLDERED_NAME)

    # wait for brokers to finish registering
    kafka_client.check_broker_count(config.DEFAULT_BROKER_COUNT)

    zookeeper = sdk_networks.get_endpoint_string(config.PACKAGE_NAME, FOLDERED_NAME, "zookeeper")
    assert zookeeper == zk_path

    # topic created earlier against default zk should no longer be present:
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_NAME, "topic list")
    assert rc == 0, "Topic list command failed"
    assert config.DEFAULT_TOPIC_NAME not in json.loads(stdout)
def test_disable_quota_role():
    """Disable role migration and verify tasks remain on the legacy role only."""
    app = sdk_marathon.get_config(SERVICE_NAME)
    # Turn off legacy role.
    app["env"]["ENABLE_ROLE_MIGRATION"] = "false"
    sdk_marathon.update_app(app)

    # Wait for scheduler to restart.
    sdk_plan.wait_for_completed_deployment(SERVICE_NAME)

    # Get the current service state to verify roles have applied.
    service_roles = sdk_utils.get_service_roles(SERVICE_NAME)
    task_roles = service_roles["task-roles"]

    # We must have some role!
    assert len(task_roles) > 0
    assert len(task_roles) == 3
    assert LEGACY_ROLE in task_roles.values()
    assert ENFORCED_ROLE not in task_roles.values()

    # Ensure we're not MULTI_ROLE, and only using the legacy-role.
    assert service_roles["framework-roles"] is None
    assert service_roles["framework-role"] == LEGACY_ROLE
def test_deploy(): wait_time = 30 # taskcfg.yml will initially fail to deploy because several options are missing in the default # sdk_marathon.json.mustache. verify that the tasks are failing before continuing. task_name = "hello-0-server" log.info("Checking that {} is failing to launch within {}s".format(task_name, wait_time)) original_state_history = _get_state_history(task_name) # wait for new TASK_FAILEDs to appear: @retrying.retry( wait_fixed=1000, stop_max_delay=1000 * wait_time, retry_on_result=lambda res: not res ) def wait_for_new_failures(): new_state_history = _get_state_history(task_name) assert len(new_state_history) >= len(original_state_history) added_state_history = new_state_history[len(original_state_history) :] log.info("Added {} state history: {}".format(task_name, ", ".join(added_state_history))) return "TASK_FAILED" in added_state_history wait_for_new_failures() # add the needed envvars in marathon and confirm that the deployment succeeds: marathon_config = sdk_marathon.get_config(config.SERVICE_NAME) env = marathon_config["env"] del env["SLEEP_DURATION"] env["TASKCFG_ALL_OUTPUT_FILENAME"] = "output" env["TASKCFG_ALL_SLEEP_DURATION"] = "1000" sdk_marathon.update_app(marathon_config) config.check_running()
def setup_constraint_switch():
    """Install hello pinned to one agent, then re-pin it to a second agent.

    Returns a tuple of (original_agent, target_agent, initial_hello_task_ids).
    """
    install.uninstall(PACKAGE_NAME)

    agents = shakedown.get_private_agents()
    first_agent = agents[0]
    second_agent = agents[1]
    print("agents", first_agent, second_agent)
    assert first_agent != second_agent

    install_options = {
        "service": {"spec_file": "examples/marathon_constraint.yml"},
        "hello": {
            "count": 1,
            # Initially pin the pod to the first agent.
            "placement": 'hostname:LIKE:{}'.format(first_agent),
        },
        "world": {"count": 0},
    }
    install.install(PACKAGE_NAME, 1, additional_options=install_options)
    tasks.check_running(PACKAGE_NAME, 1)
    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')

    # Now re-pin the pod to the second agent.
    app = marathon.get_config(PACKAGE_NAME)
    app['env']['HELLO_PLACEMENT'] = 'hostname:LIKE:{}'.format(second_agent)
    marathon.update_app(PACKAGE_NAME, app)

    # Wait for the scheduler to be up and settled before advancing.
    plan.wait_for_completed_deployment(PACKAGE_NAME)

    return first_agent, second_agent, hello_ids
def test_config_update_while_partitioned():
    """Push a config update while a world agent is partitioned; verify it applies after reconnect."""
    world_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "world")
    partition_host = world_tasks[0].host

    sdk_agents.partition_agent(partition_host)

    app = sdk_marathon.get_config(config.SERVICE_NAME)
    updated_cpus = float(app["env"]["WORLD_CPUS"]) + 0.1
    app["env"]["WORLD_CPUS"] = str(updated_cpus)
    sdk_marathon.update_app(app, wait_for_completed_deployment=False)

    sdk_agents.reconnect_agent(partition_host)

    # check that ALL the world tasks are updated after the agent reconnects:
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world", [t.id for t in world_tasks])
    check_healthy()

    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)
    running_world = [
        t for t in all_tasks if t.name.startswith("world") and t.state == "TASK_RUNNING"
    ]
    assert len(running_world) == config.world_task_count(config.SERVICE_NAME)
    for task in running_world:
        assert config.close_enough(task.resources["cpus"], updated_cpus)
def test_custom_seccomp_profile():
    """Apply a restrictive seccomp profile and wait for the resulting deployment."""
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    app = sdk_marathon.get_config(config.SERVICE_NAME)
    # With this profile, uname is disallowed and the service should crash-loop.
    app["env"]["HELLO_SECCOMP_PROFILE_NAME"] = "test_profile.json"
    sdk_marathon.update_app(app)

    sdk_marathon.wait_for_deployment(config.SERVICE_NAME, 60, None)
def change_region_config(region_name):
    """Set SERVICE_REGION to region_name, or remove it entirely when None."""
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    if region_name is None:
        del app['env']['SERVICE_REGION']
    else:
        app['env']['SERVICE_REGION'] = region_name

    sdk_marathon.update_app(config.SERVICE_NAME, app, wait_for_completed_deployment=False)
def test_state_refresh_disable_cache():
    """Disables caching via a scheduler envvar"""
    # NOTE(review): foldered_name is expected to be defined at module scope.
    config.check_running(foldered_name)
    task_ids = sdk_tasks.get_task_ids(foldered_name, "")

    # caching enabled by default:
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")
    assert rc == 0, "Refresh cache failed"
    assert "Received cmd: refresh" in stdout

    # Setting the envvar (any non-empty value) disables the scheduler's state cache.
    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config["env"]["DISABLE_STATE_CACHE"] = "any-text-here"
    sdk_marathon.update_app(marathon_config)
    sdk_plan.wait_for_completed_deployment(foldered_name)
    # Only the scheduler restarts; service tasks must be untouched.
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda res: not res)
    def check_cache_refresh_fails_409conflict():
        # Retries (via retry_on_result) until the CLI call fails with a 409 Conflict.
        rc, stdout, stderr = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, foldered_name, "debug state refresh_cache"
        )
        return rc != 0 and stdout == "" and "failed: 409 Conflict" in stderr

    check_cache_refresh_fails_409conflict()

    # Re-enable the cache by removing the envvar entirely.
    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config["env"]["DISABLE_STATE_CACHE"]
    sdk_marathon.update_app(marathon_config)
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda res: not res)
    def check_cache_refresh():
        rc, stdout, _ = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, foldered_name, "debug state refresh_cache"
        )
        assert rc == 0, "Refresh cache failed"
        return stdout

    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
def test_uninstall():
    """Set SDK_UNINSTALL and verify the uninstall 'deployment' drains all tasks."""
    config.check_running()

    # add the needed envvar in marathon and confirm that the uninstall "deployment" succeeds:
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    app['env']['SDK_UNINSTALL'] = 'w00t'
    sdk_marathon.update_app(config.SERVICE_NAME, app)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, 0)
def test_uninstall():
    """Set SDK_UNINSTALL and verify the uninstall 'deployment' drains to zero tasks."""
    config.check_running()

    # add the needed envvar in marathon and confirm that the uninstall "deployment" succeeds:
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    app["env"]["SDK_UNINSTALL"] = "w00t"
    sdk_marathon.update_app(app)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, 0, allow_more=False)
def test_bump_node_counts():
    """Grow data, ingest, and coordinator node counts by one each."""
    # Run this test last, as it changes the task count
    app = marathon.get_config(PACKAGE_NAME)

    # Bump each node-count env field by one.
    for field in ('DATA_NODE_COUNT', 'INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT'):
        app['env'][field] = str(int(app['env'][field]) + 1)

    marathon.update_app(PACKAGE_NAME, app)
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT + 3)
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    config.check_running(FOLDERED_SERVICE_NAME)
    task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, '')

    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')
    assert "Received cmd: refresh" in stdout

    # Setting the envvar (any non-empty value) disables the scheduler's state cache.
    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    # Only the scheduler restarts; service tasks must be untouched.
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, '', task_ids)
    config.check_running(FOLDERED_SERVICE_NAME)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        # Returns True only once the CLI call fails with a 409 Conflict message.
        try:
            sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False

    shakedown.wait_for(lambda: check_cache_refresh_fails_409conflict(), timeout_seconds=120.)

    # Re-enable the cache by removing the envvar entirely.
    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, '', task_ids)
    config.check_running(FOLDERED_SERVICE_NAME)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')

    stdout = shakedown.wait_for(lambda: check_cache_refresh(), timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
def test_changing_discovery_replaces_certificate_sans(hello_world_service):
    """
    Update service configuration to change discovery prefix of a task.
    Scheduler should update task and new SANs should be generated.
    """
    original_tasks = sdk_tasks.get_task_ids(config.PACKAGE_NAME, 'discovery')
    assert len(original_tasks) == 1, 'Expecting exactly one task ID'

    task_id = original_tasks[0]
    assert task_id

    # Load end-entity certificate from PEM encoded file
    end_entity_cert = x509.load_pem_x509_certificate(
        task_exec(task_id, 'cat server.crt').encode('ascii'),
        DEFAULT_BACKEND)

    san_extension = end_entity_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    # NOTE(review): reaches into cryptography's private _general_names attribute;
    # fragile across library versions.
    sans = [
        san.value for san in san_extension.value._general_names._general_names]

    # The SAN should contain the autoip DNS name built from the current prefix.
    expected_san = (
        '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=DISCOVERY_TASK_PREFIX,
            service_name=config.SERVICE_NAME)
    )
    assert expected_san in sans

    # Run task update with new discovery prefix
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['DISCOVERY_TASK_PREFIX'] = DISCOVERY_TASK_PREFIX + '-new'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'discovery', original_tasks)
    sdk_tasks.check_running(config.SERVICE_NAME, 4)

    # The discovery task must have been replaced by a new one.
    new_task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, "discovery")[0]
    assert task_id != new_task_id

    # Fetch the regenerated certificate and verify the SAN uses the new prefix.
    new_cert = x509.load_pem_x509_certificate(
        task_exec(new_task_id, 'cat server.crt').encode('ascii'),
        DEFAULT_BACKEND)

    san_extension = new_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    sans = [
        san.value for san in san_extension.value._general_names._general_names]

    expected_san = (
        '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=DISCOVERY_TASK_PREFIX + '-new',
            service_name=config.SERVICE_NAME)
    )
    assert expected_san in sans
def test_port_dynamic_to_dynamic_port():
    """Bump broker CPUs to force a rolling restart while using dynamic ports."""
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
    pod_prefix = '{}-'.format(DEFAULT_POD_TYPE)
    old_broker_ids = tasks.get_task_ids(SERVICE_NAME, pod_prefix)

    app_config = marathon.get_config(SERVICE_NAME)
    broker_cpus = int(app_config['env']['BROKER_CPUS'])
    app_config['env']['BROKER_CPUS'] = str(broker_cpus + 0.1)
    marathon.update_app(SERVICE_NAME, app_config)

    tasks.check_tasks_updated(SERVICE_NAME, pod_prefix, old_broker_ids)

    # all tasks are running
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
def test_bump_node_counts():
    """Grow data, ingest, and coordinator node counts by one each."""
    # Run this test last, as it changes the task count
    app = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)

    # Bump each node-count env field by one.
    for field in ('DATA_NODE_COUNT', 'INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT'):
        app['env'][field] = str(int(app['env'][field]) + 1)

    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, app)
    sdk_tasks.check_running(FOLDERED_SERVICE_NAME, config.DEFAULT_TASK_COUNT + 3)
def test_changing_discovery_replaces_certificate_sans(): """ Update service configuration to change discovery prefix of a task. Scheduler should update task and new SANs should be generated. """ # Load end-entity certificate from PEM encoded file _, stdout, _ = sdk_cmd.service_task_exec( config.SERVICE_NAME, "discovery-0-node", "cat server.crt" ) log.info("first server.crt: {}".format(stdout)) ascii_cert = stdout.encode("ascii") log.info("first server.crt ascii encoded: {}".format(ascii_cert)) end_entity_cert = x509.load_pem_x509_certificate(ascii_cert, DEFAULT_BACKEND) san_extension = end_entity_cert.extensions.get_extension_for_oid( ExtensionOID.SUBJECT_ALTERNATIVE_NAME ) sans = [san.value for san in san_extension.value._general_names._general_names] expected_san = "{name}-0.{service_name}.autoip.dcos.thisdcos.directory".format( name=DISCOVERY_TASK_PREFIX, service_name=config.SERVICE_NAME ) assert expected_san in sans # Run task update with new discovery prefix marathon_config = sdk_marathon.get_config(config.SERVICE_NAME) marathon_config["env"]["DISCOVERY_TASK_PREFIX"] = DISCOVERY_TASK_PREFIX + "-new" sdk_marathon.update_app(marathon_config) sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME) _, stdout, _ = sdk_cmd.service_task_exec( config.SERVICE_NAME, "discovery-0-node", "cat server.crt" ) log.info("second server.crt: {}".format(stdout)) ascii_cert = stdout.encode("ascii") log.info("second server.crt ascii encoded: {}".format(ascii_cert)) new_cert = x509.load_pem_x509_certificate(ascii_cert, DEFAULT_BACKEND) san_extension = new_cert.extensions.get_extension_for_oid(ExtensionOID.SUBJECT_ALTERNATIVE_NAME) sans = [san.value for san in san_extension.value._general_names._general_names] expected_san = "{name}-0.{service_name}.autoip.dcos.thisdcos.directory".format( name=DISCOVERY_TASK_PREFIX + "-new", service_name=config.SERVICE_NAME ) assert expected_san in sans
def test_bump_node_counts():
    """Add one ingest node and one coordinator node."""
    # bump ingest and coordinator, but NOT data, which is bumped in the following test.
    # we want to avoid adding two data nodes because the cluster sometimes won't have
    # enough room for it
    app = sdk_marathon.get_config(foldered_name)
    for field in ('INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT'):
        app['env'][field] = str(int(app['env'][field]) + 1)
    sdk_marathon.update_app(foldered_name, app)

    sdk_plan.wait_for_completed_deployment(foldered_name)

    global current_expected_task_count
    current_expected_task_count += 2
    sdk_tasks.check_running(foldered_name, current_expected_task_count)
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)