Beispiel #1
0
    def start_monitoring(self,
                         buffer_size: Size = None,
                         number_of_subbuffers: int = None):
        """Start a background blktrace session for ``self.device``.

        :param buffer_size: per-buffer trace size passed to blktrace as
            ``--buffer-size`` (converted to KiB); blktrace's default is
            used when None.
        :param number_of_subbuffers: passed as ``--num-sub-buffers``;
            blktrace's default is used when None.
        :raises Exception: if a blktrace session is already running
            (``self.blktrace_pid`` != -1).
        """
        if self.blktrace_pid != -1:
            raise Exception(
                f"blktrace already running with PID: {self.blktrace_pid}")

        self.__outputDirectoryPath = Directory.create_temp_directory(
        ).full_path

        # Drop caches so the trace reflects actual device I/O, not
        # requests served from the page cache.
        drop_caches(DropCachesMode.ALL)

        # Build optional CLI switches in fresh locals instead of
        # rebinding (and shadowing) the typed parameters.
        subbuffers_option = ("" if number_of_subbuffers is None else
                             f" --num-sub-buffers={number_of_subbuffers}")
        buffer_size_option = (
            "" if buffer_size is None else
            f" --buffer-size={buffer_size.get_value(Unit.KibiByte)}")
        command = (
            f"blktrace{subbuffers_option}{buffer_size_option}"
            f" --dev={self.device.path}"
            f"{self.masks} --output={PREFIX}"
            f" --output-dir={self.__outputDirectoryPath}"
        )
        # Detach blktrace from the shell session; `echo $!` returns the
        # background job's PID so we can stop it later.
        echo_output = TestRun.executor.run_expect_success(
            f"nohup {command} </dev/null &>{self.__outputDirectoryPath}/out & echo $!"
        )
        self.blktrace_pid = int(echo_output.stdout)
        # Fixed: the original message opened "(PID: ..." without the
        # closing parenthesis.
        TestRun.LOGGER.info(
            f"blktrace monitoring for device {self.device.path} started"
            f" (PID: {self.blktrace_pid}, output dir: {self.__outputDirectoryPath})"
        )
Beispiel #2
0
async def test_data_integrity_unplug(cache_mode):
    """
        title: Test if data integrity is maintained in a power failure scenario.
        description: |
          The test checks if the data written to the cache device is saved correctly in a power
          failure scenario, which is simulated by unplugging the cache device.
          FIO is interrupted when the cache device is unplugged. The test determines how many
          writes each FIO job was able to perform before the unplug and then checks if the data
          on the cache device matches FIO output up to the unplug (bearing in mind that the last
          write might have been interrupted).
        pass_criteria:
          - No system crash.
          - Data on the cache device are consistent with the data sent from FIO.
    """
    global fio_seed, tmp_dir, ram_disk
    cache_dev = TestRun.disks["cache"]
    core_dev = TestRun.disks["core"]

    # Upper bound for the random delay before the hot unplug.
    sleep_max_s = timedelta(seconds=10)

    with TestRun.step("Test prepare"):
        # Derive the FIO seed from the framework's seed so runs are reproducible.
        random.seed(TestRun.random_seed)
        fio_seed = random.randint(0, 2**32)
        TestRun.LOGGER.info(f"FIO seed: {fio_seed}")
        tmp_dir = Directory.create_temp_directory()
        TestRun.LOGGER.info(f"Temporary directory: {tmp_dir.full_path}")
        ram_disk = RamDisk.create(Size(1, Unit.GiB), 1)[0]

        # csums[j][i] is csum for i-th io of j-th job
        # NOTE(review): csums is created once and reused across all
        # cache_line_size iterations below — confirm the accumulation
        # across iterations is intended.
        csums = [{} for _ in range(num_jobs)]

    with TestRun.step("Test iterations:"):
        # Repeat the whole unplug scenario for every cache line size.
        for cache_line_size in TestRun.iteration(CacheLineSize):
            with TestRun.step("Prefill the core device."):
                write_device(core_dev.path)
                # Checksums of the prefill data, used as the "seqno -1" baseline.
                data_prefill_cs = read_device_md5s(core_dev.path)

            # csums_rev is a reverse mapping to identify job, sector and seqno of I/O
            # with given csum
            csums_rev = {}
            for j in range(num_jobs):
                for b in range(job_workset_blocks):
                    cs = data_prefill_cs[j][b]
                    csums_rev[cs] = get_data_name(j, b, -1)

            with TestRun.step(
                    "Start a cache, add a core and set cache cleaning policy to NOP"
            ):
                cache = casadm.start_cache(cache_dev,
                                           cache_mode,
                                           cache_line_size,
                                           force=True)
                exported_object = cache.add_core(core_dev)
                # NOP cleaning keeps dirty data in cache so the unplug can lose it.
                cache.set_cleaning_policy(CleaningPolicy.nop)

            with TestRun.step("Start FIO to the exported object"):
                # Run time far exceeds the unplug delay so fio is always
                # interrupted by the unplug, never by its own timeout.
                fio = prepare_base_fio() \
                    .target(exported_object.path) \
                    .run_time(100 * sleep_max_s)
                for i in range(num_jobs):
                    fio.add_job(f"di_{i}") \
                       .offset(job_workset_size * i) \
                       .io_size(Size(100, Unit.GiB))

                # Run fio asynchronously; awaited after the unplug below.
                fio_task = start_async_func(fio.fio.run)

            with TestRun.step("Hot unplug the cache device after random time"):
                wait_time_s = random.randint(5,
                                             int(sleep_max_s.total_seconds()))
                sleep(wait_time_s)
                cache_dev.unplug()

            with TestRun.step("Analyze FIO execution after hot unplug"):
                fio_output = await fio_task
                if fio_output.exit_code == 0:
                    TestRun.LOGGER.warning(
                        "Unexpectedly successful fio - check if the device was unplugged correctly."
                    )
                results = fio.get_results(
                    TestRun.executor.run(f"cat {fio.fio.fio_file}").stdout)
                # ios[j] = number of writes job j completed before the unplug.
                ios = [r.job.write.total_ios for r in results]

            with TestRun.step("Stop cache without flushing data"):
                try:
                    cache.stop(no_data_flush=True)
                except CmdException as e:
                    # Stop errors are expected after an unplug; re-raise
                    # anything that is not a known stop-cache message.
                    if not cli_messages.check_stderr_msg(
                            e.output, cli_messages.stop_cache_errors):
                        raise

            with TestRun.step("Plug back the cache device"):
                cache_dev.plug()

            with TestRun.step("Load cache"):
                cache = casadm.load_cache(cache_dev)

            with TestRun.step("Check data"):
                csums_actual = read_device_md5s(exported_object.path)

                # The last I/O in each job is interrupted by the unplug. It could have made it
                # to the medium or not. So the last I/O we expect to actually hit the disk
                # is 'num_io-2' or 'num_io-1' for each job. Below 'n1_' refers to 'num_io-1'
                # and 'n2_' refers to 'num_io-2'

                # seqno[j] is the last I/O seqno for given job (entire workset)
                n2_seqno = [io - 2 for io in ios]
                n1_seqno = [io - 1 for io in ios]

                # pattern[j][b] is the last I/O seqno for job j block b
                n2_pattern = get_pattern(n2_seqno)
                n1_pattern = get_pattern(n1_seqno)

                # Make sure we know data checksums for I/O that we expect to have
                # been committed assuming either n2_seqno or n1_seqno is the last
                # I/O committed by each job.
                gen_csums(ram_disk.path, n1_seqno, n1_pattern, csums,
                          csums_rev)
                gen_csums(ram_disk.path, n2_seqno, n2_pattern, csums,
                          csums_rev)

                fail = False
                for j in range(num_jobs):
                    for b in range(job_workset_blocks):
                        # possible checksums assuming n2_pattern or n1_pattern
                        cs_n2 = get_data_csum(j, b, n2_pattern,
                                              data_prefill_cs, csums)
                        cs_n1 = get_data_csum(j, b, n1_pattern,
                                              data_prefill_cs, csums)

                        # actual checksum read from CAS
                        cs_actual = csums_actual[j][b]

                        # Either the last or the second-to-last write is acceptable;
                        # anything else means the device lost or corrupted data.
                        if cs_actual != cs_n2 and cs_actual != cs_n1:
                            fail = True

                            # attempt to identify erroneous data by comparing its checksum
                            # against the known checksums
                            identity = csums_rev[cs_actual] if cs_actual in csums_rev else \
                                f"UNKNOWN ({cs_actual[:8]})"

                            TestRun.LOGGER.error(
                                f"MISMATCH job {j} block {b} contains {identity} "
                                f"expected {get_data_name(j, b, n2_pattern[j][b])} "
                                f"or {get_data_name(j, b, n1_pattern[j][b]) }")

                # NOTE(review): on failure this breaks out of the loop,
                # skipping the remaining cache_line_size iterations and
                # the final cache.stop() — confirm this is intended.
                if fail:
                    break

                cache.stop(no_data_flush=True)