def update_image_successful(install_image,
                            regenerate_image_id=True,
                            signed=False,
                            skip_reboot_verification=False,
                            expected_mender_clients=1,
                            pre_upload_callback=lambda: None,
                            pre_deployment_callback=lambda: None,
                            deployment_triggered_callback=lambda: None,
                            compression_type="gzip"):
    """
    Perform a successful upgrade, and assert that deployment status/logs are correct.

    A reboot is performed, and running partitions have been swapped.
    Deployment status will be set as successful for device.
    Logs will not be retrieved, and result in 404.

    Returns the deployment id of the performed deployment.
    """
    # NOTE(review): pre_upload_callback is accepted but never used in this
    # function — kept for interface compatibility; confirm whether it should
    # be forwarded to common_update_procedure().
    previous_inactive_part = Helpers.get_passive_partition()

    with Helpers.RebootDetector() as reboot:
        deployment_id, expected_image_id = common_update_procedure(
            install_image,
            regenerate_image_id,
            signed=signed,
            pre_deployment_callback=pre_deployment_callback,
            deployment_triggered_callback=deployment_triggered_callback,
            compression_type=compression_type)
        reboot.verify_reboot_performed()

    with Helpers.RebootDetector() as reboot:
        try:
            # after a successful update the previously passive partition
            # must now be the active one
            assert Helpers.get_active_partition() == previous_inactive_part
        except AssertionError:
            # collect device deployment logs before failing, to ease debugging
            logs = []
            for d in adm.get_devices():
                logs.append(deploy.get_logs(d["device_id"], deployment_id))

            pytest.fail(
                "device did not flip partitions during update, here are the device logs:\n\n %s"
                % (logs))

        deploy.check_expected_statistics(deployment_id, "success",
                                         expected_mender_clients)

        # successful deployments store no logs; the backend answers 404
        for d in adm.get_devices():
            deploy.get_logs(d["device_id"], deployment_id, expected_status=404)

        if not skip_reboot_verification:
            reboot.verify_reboot_not_performed()

    assert Helpers.yocto_id_installed_on_machine() == expected_image_id
    deploy.check_expected_status("finished", deployment_id)

    # make sure backend recognizes signed and unsigned images
    artifact_id = deploy.get_deployment(deployment_id)["artifacts"][0]
    artifact_info = deploy.get_artifact_details(artifact_id)
    # message typo fixed: "correct recognized" -> "correctly recognized"
    assert artifact_info["signed"] is signed, \
        "image was not correctly recognized as signed/unsigned"

    return deployment_id
def update_image_failed(install_image="broken_update.ext4",
                        expected_mender_clients=1):
    """
    Perform an upgrade using a broken image (random data).

    The device will reboot, u-boot will detect this is not a bootable image,
    and revert to the previous partition. The resulting upgrade will be
    considered a failure.
    """
    # (removed an unused local: the accepted-clients list was fetched but
    # never referenced)
    original_image_id = Helpers.yocto_id_installed_on_machine()

    previous_active_part = Helpers.get_active_partition()
    with Helpers.RebootDetector() as reboot:
        deployment_id, _ = common_update_procedure(install_image,
                                                   broken_image=True)
        reboot.verify_reboot_performed()

    with Helpers.RebootDetector() as reboot:
        # the rollback must leave us on the same partition we started on
        assert Helpers.get_active_partition() == previous_active_part

        deploy.check_expected_statistics(deployment_id, "failure",
                                         expected_mender_clients)

        # every device should have logged the state-machine failure
        for d in adm.get_devices():
            assert "got invalid entrypoint into the state machine" in \
                deploy.get_logs(d["device_id"], deployment_id)

        assert Helpers.yocto_id_installed_on_machine() == original_image_id
        reboot.verify_reboot_not_performed()

    deploy.check_expected_status("finished", deployment_id)
def test_update_image_id_already_installed(
        self, install_image=conftest.get_valid_image()):
    """Deploying an artifact that is already installed on the devices results
    in the deployment finishing with "already-installed" status for every
    accepted device, without performing another update."""
    # NOTE(review): the default argument is evaluated once at import time;
    # confirm conftest.get_valid_image() is safe to call at module load.
    if not env.host_string:
        execute(self.test_update_image_id_already_installed,
                hosts=get_mender_clients(),
                install_image=install_image)
        return

    # First perform a normal successful update so that the artifact becomes
    # the currently installed image on the device.
    with Helpers.RebootDetector() as reboot:
        deployment_id, expected_image_id = common_update_procedure(
            install_image, True)
        reboot.verify_reboot_performed()

    devices_accepted_id = [
        device["id"] for device in auth_v2.get_devices_status("accepted")
    ]
    # Trigger a second deployment of the same artifact; the client should
    # report "already-installed" instead of updating again.
    deployment_id = deploy.trigger_deployment(name="New valid update",
                                              artifact_name=expected_image_id,
                                              devices=devices_accepted_id)

    deploy.check_expected_statistics(deployment_id, "already-installed",
                                     len(get_mender_clients()))
    deploy.check_expected_status("finished", deployment_id)
def test_reject_bootstrap(self):
    """Make sure a rejected device does not perform an upgrade, and that it
    gets its auth token removed."""
    if not env.host_string:
        execute(self.test_reject_bootstrap, hosts=get_mender_clients())
        return

    # iterate over devices and reject them
    for device in adm.get_devices():
        adm.set_device_status(device["id"], "rejected")
        logging.info("Rejecting DeviceID: %s" % device["id"])

    adm.check_expected_status("rejected", len(get_mender_clients()))

    with Helpers.RebootDetector() as reboot:
        try:
            deployment_id, _ = common_update_procedure(
                install_image=conftest.get_valid_image())
        except AssertionError:
            # expected path: deploying to a rejected device fails
            logging.info("Failed to deploy upgrade to rejected device.")
            reboot.verify_reboot_not_performed()
        else:
            # use pytest.fail (not a bare `assert False`, which is stripped
            # under -O), so we can get backend logs
            pytest.fail("No error while trying to deploy to rejected device")

    # authtoken has been removed from mender-store
    run("strings /data/mender/mender-store | grep -q 'authtoken' || false")

    # re-accept device after test is done
    adm.accept_devices(1)
def test_deployed_during_network_outage(
        self, install_image=conftest.get_valid_image()):
    """
    Install a valid upgrade image while there is no network availability on
    the device. Re-establishing the network connectivity results in the
    upgrade to be triggered.

    Emulate a flaky network connection, and ensure that the deployment still
    succeeds.
    """
    if not env.host_string:
        execute(self.test_deployed_during_network_outage,
                hosts=get_mender_clients(),
                install_image=install_image)
        return

    # cut the device off from the gateway before the deployment is created
    Helpers.gateway_connectivity(False)

    with Helpers.RebootDetector() as reboot:
        deployment_id, expected_yocto_id = common_update_procedure(
            install_image, verify_status=False)
        time.sleep(60)

        # flap the link a few times to emulate a flaky connection:
        # even attempts bring it up, odd attempts take it down again
        for attempt in range(5):
            time.sleep(5)
            Helpers.gateway_connectivity(attempt % 2 == 0)

        # finally leave connectivity up for good
        Helpers.gateway_connectivity(True)
        logging.info("Network stabilized")
        reboot.verify_reboot_performed()
        deploy.check_expected_statistics(deployment_id, "success",
                                         len(get_mender_clients()))

    assert Helpers.yocto_id_installed_on_machine() == expected_yocto_id
def test_image_download_retry_hosts_broken(
        self, install_image=conftest.get_valid_image()):
    """
    Block the storage host (minio) by modifying the hosts file, and verify
    that the client retries the artifact download until connectivity is
    restored.
    """
    if not env.host_string:
        execute(self.test_image_download_retry_hosts_broken,
                hosts=get_mender_clients(),
                install_image=install_image)
        return

    inactive_part = Helpers.get_passive_partition()

    # break s3 connectivity before triggering deployment
    run("echo '1.1.1.1 s3.docker.mender.io' >> /etc/hosts")

    with Helpers.RebootDetector() as reboot:
        # the deployment id is never used here; only the new image id is
        # needed for the post-update assertion
        _, new_yocto_id = common_update_procedure(install_image)
        self.wait_for_download_retry_attempts()

        # restore s3 connectivity so the pending retry can succeed
        run("sed -i.bak '/1.1.1.1/d' /etc/hosts")
        reboot.verify_reboot_performed()

        # partitions have swapped and the new image is installed
        assert Helpers.get_active_partition() == inactive_part
        assert Helpers.yocto_id_installed_on_machine() == new_yocto_id
        reboot.verify_reboot_not_performed()
def test_large_update_image(self):
    """Installing an image larger than the passive/active partition size
    should result in a failure."""
    if not env.host_string:
        execute(self.test_large_update_image, hosts=get_mender_clients())
        return

    with Helpers.RebootDetector() as reboot:
        # the oversized artifact is random data, so the image id cannot be
        # regenerated and the install is expected to fail
        deployment_id, _ = common_update_procedure(
            install_image="large_image.dat",
            regenerate_image_id=False,
            broken_image=True)
        deploy.check_expected_statistics(deployment_id, "failure",
                                         len(get_mender_clients()))
        # the failed install must never trigger a reboot
        reboot.verify_reboot_not_performed()

    deploy.check_expected_status("finished", deployment_id)
def abort_deployment(self, abort_step=None, mender_performs_reboot=False):
    """
    Trigger a deployment, and cancel it within 15 seconds, make sure no
    deployment is performed.

    Args:
        abort_step: if set, wait until the deployment reaches this
            status (e.g. "downloading") before aborting; if None, abort
            immediately.
        mender_performs_reboot: if set to False, a manual reboot is performed
            and checks are performed.
            if set to True, wait until device is rebooted.
    """
    if not env.host_string:
        execute(self.abort_deployment,
                abort_step=abort_step,
                mender_performs_reboot=mender_performs_reboot,
                hosts=get_mender_clients())
        return

    install_image = conftest.get_valid_image()
    # record current state so we can verify nothing changed after the abort
    expected_partition = Helpers.get_active_partition()
    expected_image_id = Helpers.yocto_id_installed_on_machine()

    with Helpers.RebootDetector() as reboot:
        deployment_id, _ = common_update_procedure(install_image,
                                                   verify_status=False)

        if abort_step is not None:
            deploy.check_expected_statistics(deployment_id, abort_step,
                                             len(get_mender_clients()))
        deploy.abort(deployment_id)
        deploy.check_expected_statistics(deployment_id, "aborted",
                                         len(get_mender_clients()))

        # no deployment logs are sent by the client, is this expected?
        for d in auth_v2.get_devices():
            deploy.get_logs(d["id"], deployment_id, expected_status=404)

        if mender_performs_reboot:
            # If Mender performs reboot, we need to wait for it to reboot
            # back into the original filesystem.
            reboot.verify_reboot_performed(number_of_reboots=2)
        else:
            # Else we reboot ourselves, just to make sure that we have not
            # unintentionally switched to the new partition.
            reboot.verify_reboot_not_performed()
            run("( sleep 10 ; reboot ) 2>/dev/null >/dev/null &")
            reboot.verify_reboot_performed()

    # device must still be on the original partition and image
    assert Helpers.get_active_partition() == expected_partition
    assert Helpers.yocto_id_installed_on_machine() == expected_image_id
    deploy.check_expected_status("finished", deployment_id)
def test_reject_bootstrap(self):
    """Make sure a rejected device does not perform an upgrade, and that it
    gets its auth token removed."""
    if not env.host_string:
        execute(self.test_reject_bootstrap, hosts=get_mender_clients())
        return

    # iterate over devices and reject them
    for device in adm.get_devices():
        adm.set_device_status(device["id"], "rejected")
        logging.info("Rejecting DeviceID: %s" % device["id"])

    adm.check_expected_status("rejected", len(get_mender_clients()))

    with Helpers.RebootDetector() as reboot:
        try:
            deployment_id, _ = common_update_procedure(
                install_image=conftest.get_valid_image())
        except AssertionError:
            # expected path: deploying to a rejected device fails
            logging.info("Failed to deploy upgrade to rejected device.")
            reboot.verify_reboot_not_performed()
        else:
            # use pytest.fail, so we can get backend logs
            pytest.fail("no error while trying to deploy to rejected device")

    finished = False

    # wait until the authtoken is removed from the mender-store file; the
    # journal line confirms that the client saw the rejection
    for _ in range(10):
        with settings(abort_exception=Exception):
            try:
                run("journalctl -u mender -l -n 3 | grep -q 'authentication request rejected'")
            except Exception:
                # `run` aborts with Exception (see settings above) when the
                # grep fails; back off and retry.  Narrowed from a bare
                # `except:` so KeyboardInterrupt/SystemExit still propagate.
                time.sleep(30)
            else:
                finished = True
                break

    adm.accept_devices(1)

    if not finished:
        pytest.fail("failed to remove authtoken from mender-store file")
def test_deployment_abortion_success(self):
    """Aborting a deployment that already finished successfully must leave
    its statistics and status untouched."""
    # maybe an acceptance test is enough for this check?
    if not env.host_string:
        execute(self.test_deployment_abortion_success,
                hosts=get_mender_clients())
        return

    valid_image = conftest.get_valid_image()

    # run a plain, successful deployment first
    with Helpers.RebootDetector() as reboot:
        deployment_id, _ = common_update_procedure(valid_image)
        reboot.verify_reboot_performed()

    deploy.check_expected_statistics(deployment_id, "success",
                                     len(get_mender_clients()))

    # now try to abort it; since it is already finished, nothing may change
    time.sleep(5)
    deploy.abort_finished_deployment(deployment_id)
    deploy.check_expected_statistics(deployment_id, "success",
                                     len(get_mender_clients()))

    deploy.check_expected_status("finished", deployment_id)
def test_update_image_recovery(self, install_image=conftest.get_valid_image()):
    """
    Install an update, and reboot the system when we detect it's being
    copied over to the inactive partition.

    The test should result in a failure.
    """
    if not env.host_string:
        execute(self.test_update_image_recovery,
                hosts=get_mender_clients(),
                install_image=install_image)
        return

    installed_yocto_id = Helpers.yocto_id_installed_on_machine()

    inactive_part = Helpers.get_passive_partition()
    with Helpers.RebootDetector() as reboot:
        deployment_id, _ = common_update_procedure(install_image)
        active_part = Helpers.get_active_partition()

        # poll (up to ~30s) until the download starts writing to the
        # inactive partition, then kill mender mid-write and force a reboot
        # NOTE(review): if the write is never detected the loop falls
        # through silently; verify_reboot_performed() below will then fail.
        for _ in range(60):
            time.sleep(0.5)
            with quiet():
                # make sure we are writing to the inactive partition
                output = run("fuser -mv %s" % (inactive_part))
            if output.return_code == 0:
                run("killall -s 9 mender")
                with settings(warn_only=True):
                    run("( sleep 3 ; reboot ) 2>/dev/null >/dev/null &")
                break

        logging.info("Waiting for system to finish reboot")
        reboot.verify_reboot_performed()

        # the interrupted update must roll back to the original partition
        assert Helpers.get_active_partition() == active_part
        deploy.check_expected_statistics(deployment_id, "failure",
                                         len(get_mender_clients()))
        reboot.verify_reboot_not_performed()

    assert Helpers.yocto_id_installed_on_machine() == installed_yocto_id
def test_update_image_breaks_networking(
        self,
        install_image="core-image-full-cmdline-%s-broken-network.ext4"
        % conftest.machine_name):
    """
    Install an image without systemd-networkd binary existing.
    The network will not function, mender will not be able to send any logs.

    The expected status is the update will rollback, and be considered a
    failure.
    """
    if not env.host_string:
        execute(self.test_update_image_breaks_networking,
                hosts=get_mender_clients(),
                install_image=install_image)
        return

    with Helpers.RebootDetector() as reboot:
        deployment_id, _ = common_update_procedure(install_image)
        # since the network is broken, two reboots will be performed,
        # and the last one will be detected
        reboot.verify_reboot_performed()

    deploy.check_expected_statistics(deployment_id, "failure",
                                     len(get_mender_clients()))
def test_reboot_recovery(self, description, test_set):
    # Run a deployment whose state scripts force reboots/errors at specific
    # points (described by test_set), then verify that the sequence of
    # executed scripts and the final active partition match the
    # expectations recorded in test_set.
    if not env.host_string:
        execute(self.test_reboot_recovery,
                description,
                test_set,
                hosts=get_mender_clients())
        return

    client = env.host_string

    work_dir = "test_state_scripts.%s" % client

    # every script appends its own name to a log, so the executed flow can
    # be compared against test_set["ExpectedScriptFlow"] afterwards
    script_content = '#!/bin/sh\n\necho "$(basename $0)" >> /data/test_state_scripts.log\n'

    script_failure_content = script_content + 'sync\necho b > /proc/sysrq-trigger\n'  # flush to disk before killing

    # This is only needed in the case: die commit-leave,
    # otherwise the device will get stuck in a boot-reboot loop
    script_reboot_once = ('''#!/bin/sh
if [ $(grep -c $(basename $0) /data/test_state_scripts.log) -eq 0 ]; then
echo "$(basename $0)" >> /data/test_state_scripts.log && sync && echo b > /proc/sysrq-trigger
fi
echo "$(basename $0)" >> /data/test_state_scripts.log
exit 0''')

    script_error_content = script_content + "exit 1"

    broken_image = test_set.get("Rollback", False)

    # Put artifact-scripts in the artifact.
    artifact_script_dir = os.path.join(work_dir, "artifact-scripts")

    if os.path.exists(work_dir):
        shutil.rmtree(work_dir, ignore_errors=True)

    os.mkdir(work_dir)
    os.mkdir(artifact_script_dir)

    new_rootfs = os.path.join(work_dir, "rootfs.ext4")
    shutil.copy(conftest.get_valid_image(), new_rootfs)

    # create an (empty) /etc/mender/scripts directory inside the new rootfs
    # by scripting debugfs over its stdin
    ps = subprocess.Popen(["debugfs", "-w", new_rootfs],
                          stdin=subprocess.PIPE)
    ps.stdin.write("cd /etc/mender\n"
                   "mkdir scripts\n"
                   "cd scripts\n")
    ps.stdin.close()
    ps.wait()

    for script in test_set.get("ScriptOrder"):
        if not script.startswith("Artifact"):
            # Not an artifact script, skip this one.
            continue

        with open(os.path.join(artifact_script_dir, script), "w") as fd:
            # NOTE(review): the first `if` below is not chained to the
            # `if/elif/else` that follows, so a script listed only in
            # "RebootScripts" gets script_failure_content *and* the plain
            # script_content appended by the trailing `else`.  Benign in
            # practice (the failure content reboots the box before the
            # appended lines run), but confirm whether `elif` was intended.
            if script in test_set.get("RebootScripts", []):
                fd.write(script_failure_content)
            if script in test_set.get("RebootOnceScripts", []):
                fd.write(script_reboot_once)
            elif script in test_set.get("ErrorScripts", []):
                fd.write(script_error_content)
            else:
                fd.write(script_content)

    # Now create the artifact, and make the deployment.
    device_id = Helpers.ip_to_device_id_map([client])[client]

    with Helpers.RebootDetector() as reboot_detector:
        deployment_id = common_update_procedure(
            install_image=new_rootfs,
            broken_image=broken_image,
            verify_status=True,
            devices=[device_id],
            scripts=[artifact_script_dir])[0]

        try:
            orig_part = Helpers.get_active_partition()

            # handle case where the client has not finished the update
            # path on the committed partition, but new partition is installed,
            # thus we will not get a valid entrypoint into the uncommitted
            # partition (reboot_leave) and the client will thus reboot
            # straight after starting, and u-boot will fall back to the
            # committed partition
            if test_set.get("DoubleReboot", False):
                reboot_detector.verify_reboot_performed(number_of_reboots=2)
            else:
                reboot_detector.verify_reboot_performed()

            # wait until the last script has been run (poll for up to 1h)
            logger.debug("waint until the last script has been run")
            script_logs = ""
            timeout = time.time() + 60 * 60
            while timeout >= time.time():
                time.sleep(3)
                script_logs = run("cat /data/test_state_scripts.log")
                if test_set.get("ExpectedScriptFlow")[-1] in script_logs:
                    break

            # make sure the client ended up on the right partition
            if "OtherPartition" in test_set.get("ExpectedFinalPartition", []):
                assert orig_part != Helpers.get_active_partition()
            else:
                assert orig_part == Helpers.get_active_partition()

            assert script_logs.split() == test_set.get("ExpectedScriptFlow")
        finally:
            # always remove the state scripts and their log, then restart
            # mender so subsequent tests start from a clean client
            run("systemctl stop mender && " +
                "rm -f /data/test_state_scripts.log && " +
                "rm -rf /etc/mender/scripts && " +
                "rm -rf /data/mender/scripts && " +
                "systemctl start mender")
def test_image_download_retry_timeout(
        self, test_set, install_image=conftest.get_valid_image()):
    """
    Install an update, and block storage connection when we detect it's
    being copied over to the inactive partition.

    The test should result in a successful download retry.
    """
    if not env.host_string:
        execute(self.test_image_download_retry_timeout,
                test_set,
                hosts=get_mender_clients(),
                install_image=install_image)
        return

    # make tcp timeout quicker, none persistent changes
    run("echo 2 > /proc/sys/net/ipv4/tcp_keepalive_time")
    run("echo 2 > /proc/sys/net/ipv4/tcp_keepalive_intvl")
    run("echo 3 > /proc/sys/net/ipv4/tcp_syn_retries")

    # to speed up timeouting client connection
    run("echo 1 > /proc/sys/net/ipv4/tcp_keepalive_probes")

    inactive_part = Helpers.get_passive_partition()

    with Helpers.RebootDetector() as reboot:
        if test_set['blockAfterStart']:
            # Block after we start the download.
            deployment_id, new_yocto_id = common_update_procedure(
                install_image)
            # poll (up to ~30s) until the client begins writing the image
            for _ in range(60):
                time.sleep(0.5)
                with quiet():
                    # make sure we are writing to the inactive partition
                    output = run("fuser -mv %s" % (inactive_part))
                if output.return_code == 0:
                    break
            else:
                # for/else: loop finished without break -> download never began
                pytest.fail("Download never started?")

        # use iptables to block traffic to storage
        Helpers.gateway_connectivity(
            False, hosts=["s3.docker.mender.io"])  # disable connectivity

        if not test_set['blockAfterStart']:
            # Block before we start the download.
            deployment_id, new_yocto_id = common_update_procedure(
                install_image)

        # re-enable connectivity after 2 retries
        self.wait_for_download_retry_attempts(test_set['logMessageToLookFor'])
        Helpers.gateway_connectivity(
            True, hosts=["s3.docker.mender.io"])  # re-enable connectivity

        reboot.verify_reboot_performed()

        # the retried download must complete: partitions swapped, new image
        assert Helpers.get_active_partition() == inactive_part
        assert Helpers.yocto_id_installed_on_machine() == new_yocto_id
        reboot.verify_reboot_not_performed()
def test_update_device_group(self):
    """
    Perform a successful upgrade on one group of devices, and assert that:
    * deployment status/logs are correct.
    * only the correct group is updated, not the other one.

    A reboot is performed, and running partitions have been swapped.
    Deployment status will be set as successful for device.
    Logs will not be retrieved, and result in 404.
    """

    # Beware that there will be two parallel things going on below, one for
    # each group, hence a lot of separate execute() calls for each. We aim
    # to update the group alpha, not beta.

    clients = get_mender_clients()
    assert (len(clients) == 2)
    alpha = clients[0]
    bravo = clients[1]

    ip_to_device_id = Helpers.ip_to_device_id_map(clients)
    id_alpha = ip_to_device_id[alpha]
    id_bravo = ip_to_device_id[bravo]
    print("ID of alpha host: %s\nID of bravo host: %s" % (id_alpha, id_bravo))

    # record both passive partitions so we can verify later which host
    # actually flipped
    ret = execute(Helpers.get_passive_partition, hosts=clients)
    pass_part_alpha = ret[alpha]
    pass_part_bravo = ret[bravo]

    # only alpha goes into the group that will be updated
    inv.put_device_in_group(id_alpha, "Update")

    reboot = {alpha: None, bravo: None}
    with Helpers.RebootDetector(alpha) as reboot[alpha], \
            Helpers.RebootDetector(bravo) as reboot[bravo]:
        deployment_id, expected_image_id = common_update_procedure(
            conftest.get_valid_image(), devices=[id_alpha])

        @parallel
        def verify_reboot_performed_for_alpha_only():
            if env.host_string == alpha:
                reboot[alpha].verify_reboot_performed()
            elif env.host_string == bravo:
                # Extra long wait here, because a real update takes quite a lot
                # of time.
                reboot[bravo].verify_reboot_not_performed(300)
            else:
                raise Exception(
                    "verify_reboot_performed_for_alpha_only() called with unknown host"
                )

        execute(verify_reboot_performed_for_alpha_only, hosts=clients)

    # alpha must have swapped partitions; bravo must be unchanged
    ret = execute(Helpers.get_passive_partition, hosts=clients)
    assert ret[alpha] != pass_part_alpha
    assert ret[bravo] == pass_part_bravo
    ret = execute(Helpers.get_active_partition, hosts=clients)
    assert ret[alpha] == pass_part_alpha
    assert ret[bravo] != pass_part_bravo

    # only one device (alpha) was targeted, so expect exactly one success
    deploy.check_expected_statistics(deployment_id,
                                     expected_status="success",
                                     expected_count=1)

    # No logs for either host: alpha because it was successful, bravo
    # because it should never have attempted an update in the first place.
    for id in [id_alpha, id_bravo]:
        deploy.get_logs(id, deployment_id, expected_status=404)

    assert execute(Helpers.yocto_id_installed_on_machine,
                   hosts=alpha)[alpha] == expected_image_id
    assert execute(Helpers.yocto_id_installed_on_machine,
                   hosts=bravo)[bravo] != expected_image_id

    # Important: Leave the groups as you found them: Empty.
    inv.delete_device_from_group(id_alpha, "Update")