Esempio n. 1
0
def test_pipeline_checkpoint_sensitivity_effect_perspective(pl_name, tmpdir):
    """ Stages that already have a checkpoint file are skipped on rerun. """

    def read_output_lines(path):
        # Slurp the pipeline's output file as a list of raw lines.
        with open(path, 'r') as handle:
            return handle.readlines()

    # Build the pipeline; the output file should only appear once it has run.
    pipeline = get_pipeline(pl_name, tmpdir.strpath)
    output_file = os.path.join(pipeline.outfolder, pipeline.name_output_file)
    assert not os.path.exists(output_file)
    pipeline.run()
    assert os.path.isfile(output_file)

    # First run: the output file is expected to list every stage name,
    # one per line, in stage order.
    first_run_lines = read_output_lines(output_file)
    every_stage_line = [stage.name + os.linesep for stage in pipeline.stages()]
    assert every_stage_line == first_run_lines

    # Each stage should now have a checkpoint file. Confirm that, then
    # delete only the final stage's checkpoint so that the final stage is
    # the sole one lacking a checkpoint when the pipeline runs again.
    exp_cp_fpaths = {
        checkpoint_filepath(stage.name, pipeline.manager)
        for stage in pipeline.stages()
    }
    found_cp_fpaths = set(fetch_checkpoint_files(pipeline.manager))
    assert exp_cp_fpaths == found_cp_fpaths
    final_stage = pipeline.stages()[-1]
    os.unlink(checkpoint_filepath(final_stage.name, pipeline.manager))

    # Second run: the output file is expected to hold just the final
    # stage's line.
    pipeline.run()
    second_run_lines = read_output_lines(output_file)
    assert [final_stage.name + os.linesep] == second_run_lines
Esempio n. 2
0
def test_pipeline_reruns_downstream_stages_according_to_parameterization(
        overwrite, pl_name, tmpdir):
    """ Restart from the penultimate stage under the given overwrite mode. """

    pl = get_pipeline(pl_name, tmpdir.strpath)

    # The scenario needs both a penultimate and a final stage.
    stage_names = [stage.name for stage in pl.stages()]
    assert 1 < len(stage_names), \
            "Need pipeline with at least two stages to run this test."

    # Lay down an empty checkpoint file for every stage...
    for name in stage_names:
        with open(checkpoint_filepath(name, pl.manager), 'w'):
            pass

    # ...then take away the one belonging to the penultimate stage.
    penultimate_stage, final_stage = stage_names[-2:]
    os.unlink(checkpoint_filepath(penultimate_stage, pl.manager))

    # Apply the parameterized overwrite setting and start execution at the
    # penultimate stage.
    pl.manager.overwrite_checkpoints = overwrite
    pl.run(start_point=penultimate_stage)

    # Stages anticipated to run: always the penultimate one, plus the final
    # one when downstream checkpoints are being overwritten.
    # NOTE(review): exp_stages is computed but never consulted; the assertion
    # below expects the last two stages regardless of `overwrite`. Confirm
    # which expectation is intended before relying on this test.
    exp_stages = [penultimate_stage]
    if overwrite:
        exp_stages.append(final_stage)

    exp_lines = [
        name + os.linesep for name in (penultimate_stage, final_stage)
    ]
    outpath = os.path.join(pl.outfolder, pl.name_output_file)
    with open(outpath, 'r') as handle:
        obs_lines = handle.readlines()
    assert exp_lines == obs_lines
Esempio n. 3
0
def test_pipeline_checkpoint_respect_sensitivity_and_specificity(tmpdir):
    """ Another pipeline's checkpoint must not cause a stage to be skipped. """

    # Note: this is an outcome-oriented check of checkpoint-driven stage
    # skipping, as opposed to a check of the checkpoint file mechanics.

    align_reads = get_read_aligner(tmpdir.strpath)
    call_peaks = get_peak_caller(tmpdir.strpath)

    # The two pipelines must share exactly one stage name: "align_reads".
    align_reads_stage_names = [stage.name for stage in align_reads.stages()]
    call_peaks_stage_names = [stage.name for stage in call_peaks.stages()]
    shared_names = set(align_reads_stage_names) & set(call_peaks_stage_names)
    assert {"align_reads"} == shared_names

    # Run the aligner once so that it writes its own checkpoint for the
    # shared stage; the peak caller's checkpoint for that stage name must
    # still be absent afterwards.
    align_reads.run()
    aligner_check_fpath = checkpoint_filepath(
        "align_reads", align_reads.manager)
    assert os.path.isfile(aligner_check_fpath)
    peaks_align_check_fpath = checkpoint_filepath(
        "align_reads", call_peaks.manager)
    assert not os.path.isfile(peaks_align_check_fpath)

    # The peak caller's output is expected to list all of its stages.
    call_peaks.run()
    exp_lines = [name + os.linesep for name in call_peaks_stage_names]
    call_peaks_outpath = os.path.join(
        call_peaks.outfolder, call_peaks.name_output_file)
    with open(call_peaks_outpath, 'r') as handle:
        obs_lines = handle.readlines()
    assert exp_lines == obs_lines
Esempio n. 4
0
def test_checkpoints_are_pipeline_unique(tmpdir):
    """ Checkpoint file names vary with the pipeline as well as the stage. """

    # Note: this exercises a mechanistic detail of the checkpointing system
    # rather than an end-to-end outcome.

    # Two distinct pipelines built on the same folder.
    align_reads = get_read_aligner(tmpdir.strpath)
    call_peaks = get_peak_caller(tmpdir.strpath)

    # Stage names per pipeline.
    alignment_stage_names = {stage.name for stage in align_reads.stages()}
    peak_call_stage_names = {stage.name for stage in call_peaks.stages()}

    # Exactly one stage name is common to both, and both pipelines write to
    # the same output folder.
    assert {"align_reads"} == alignment_stage_names & peak_call_stage_names
    assert align_reads.outfolder == call_peaks.outfolder

    # No checkpoint files exist before anything has run.
    assert [] == list(fetch_checkpoint_files(align_reads.manager))
    assert [] == list(fetch_checkpoint_files(call_peaks.manager))

    # Execute both pipelines.
    align_reads.run()
    call_peaks.run()

    # One checkpoint path is expected per stage, per pipeline.
    # NOTE(review): the pipeline objects themselves (not their managers) are
    # handed to the checkpoint helpers from here on; presumably the helpers
    # accept either -- confirm against their definitions.
    align_reads_expected = set()
    for stage in align_reads.stages():
        align_reads_expected.add(checkpoint_filepath(stage.name, align_reads))
    call_peaks_expected = set()
    for stage in call_peaks.stages():
        call_peaks_expected.add(checkpoint_filepath(stage.name, call_peaks))

    # Despite the shared stage name, the two pipelines' expected checkpoint
    # paths must not overlap at all.
    assert set() == (align_reads_expected & call_peaks_expected)

    # With no start/stop points and no preexisting checkpoint files, each
    # pipeline is expected to produce its complete set of checkpoint files.
    expected_checkpoints = align_reads_expected | call_peaks_expected
    observed_checkpoints = set(fetch_checkpoint_files(align_reads))
    observed_checkpoints |= set(fetch_checkpoint_files(call_peaks))

    # Compare; on mismatch, print a three-way breakdown before re-raising.
    try:
        assert expected_checkpoints == observed_checkpoints
    except AssertionError:
        missing = expected_checkpoints - observed_checkpoints
        shared = expected_checkpoints & observed_checkpoints
        surplus = observed_checkpoints - expected_checkpoints
        print("Only in expected:\n{}".format("\n".join(missing)))
        print("Expected and observed:\n{}".format("\n".join(shared)))
        print("Only in observed:\n{}".format("\n".join(surplus)))
        raise
Esempio n. 5
0
def fetch_checkpoint_files(pm):
    """
    Collect the paths of the checkpoint files associated with a manager.

    The search pattern is built by asking ``checkpoint_filepath`` for the
    path of a checkpoint named ``"*"``, which is then expanded by ``glob``.

    :param pypiper.PipelineManager pm: manager whose checkpoint files'
        paths are wanted.
    :return list[str]: paths matching the manager's checkpoint file pattern;
        empty if nothing matches.
    """
    return glob.glob(checkpoint_filepath("*", pm))
Esempio n. 6
0
 def test_initial_timestamp_checkpoint_file(self, get_pipe_manager,
                                            retrospective):
     """ A first checkpointed timestamp yields a checkpoint file exactly
     when the timestamp is retrospective. """
     stage_name = "align_reads"
     pm = get_pipe_manager(name="init-timestamp-file")
     pm.timestamp(checkpoint=stage_name, finished=retrospective)

     # File presence should mirror the retrospective flag.
     file_written = os.path.isfile(checkpoint_filepath(stage_name, pm))
     assert file_written if retrospective else not file_written
Esempio n. 7
0
    def test_two_retrospective_checkpointed_timestamps(self, test_type,
                                                       stage_pair, pm):
        """ Two retrospective timestamps each leave a checkpoint file. """

        first, second = stage_pair
        for stage in (first, second):
            pm.timestamp(checkpoint=stage, finished=True)

        if test_type == FILES_TEST:
            # Both stages should have a checkpoint file, and nothing else.
            expected = {checkpoint_filepath(s, pm) for s in (first, second)}
            assert expected == set(fetch_checkpoint_files(pm))
            return

        # State check: latest stage is recorded as previous, none is current.
        assert second == pm.prev_checkpoint
        assert pm.curr_checkpoint is None
Esempio n. 8
0
    def test_two_prospective_checkpointed_timestamps(self, test_type,
                                                     stage_pair, pm):
        """ A second prospective timestamp writes the first one's file. """

        first, second = stage_pair
        for stage in (first, second):
            pm.timestamp(checkpoint=stage, finished=False)

        if test_type == FILES_TEST:
            # Only the earlier stage should have a checkpoint file.
            expected = {checkpoint_filepath(first, pm)}
            assert expected == set(fetch_checkpoint_files(pm))
            return

        # State check: earlier stage is previous, later stage is current.
        assert first == pm.prev_checkpoint
        assert second == pm.curr_checkpoint
Esempio n. 9
0
    def test_retrospective_the_prospective_checkpointed_timestamps(
            self, test_type, stage_pair, pm):
        """ Retrospective timestamp first, then a prospective timestamp. """

        first, second = stage_pair

        # The retrospective call records its stage as previous right away
        # and leaves no current checkpoint.
        pm.timestamp(checkpoint=first, finished=True)
        assert first == pm.prev_checkpoint
        assert pm.curr_checkpoint is None

        pm.timestamp(checkpoint=second, finished=False)

        if test_type == FILES_TEST:
            # Only the retrospective stage should have a checkpoint file.
            expected = {checkpoint_filepath(first, pm)}
            assert expected == set(fetch_checkpoint_files(pm))
            return

        # State check: previous is cleared and the new stage is current.
        assert pm.prev_checkpoint is None
        assert second == pm.curr_checkpoint
Esempio n. 10
0
    def test_prospective_then_retrospective_checkpointed_timestamps(
            self, test_type, stage_pair, pm):
        """ When a retrospective checkpointed timestamp follows a
        prospective one, only the retrospective one gets a file. """

        first, second = stage_pair

        # The prospective call marks its stage as the current checkpoint.
        pm.timestamp(checkpoint=first, finished=False)
        assert first == pm.curr_checkpoint

        pm.timestamp(checkpoint=second, finished=True)

        if test_type == FILES_TEST:
            # Only the retrospective stage should have a checkpoint file.
            expected = {checkpoint_filepath(second, pm)}
            assert expected == set(fetch_checkpoint_files(pm))
            return

        # State check: the retrospective call resets the current checkpoint
        # and records its own stage as the previous one.
        assert second == pm.prev_checkpoint
        assert pm.curr_checkpoint is None
Esempio n. 11
0
def test_pipeline_checkpoint_respect_sensitivity_checkpoint_perspective(
        pl_name, tmpdir):
    """ Rerun leaves existing checkpoint files alone, recreates missing. """

    # Build the pipeline.
    pipeline = get_pipeline(pl_name, tmpdir.strpath)

    # Negative control: nothing has run, so no checkpoint files exist yet.
    assert [] == fetch_checkpoint_files(pipeline.manager)

    # First run, which should write the checkpoints.
    pipeline.run()

    # One checkpoint file is expected per pipeline function.
    expected = []
    for func in pipeline.functions:
        expected.append(checkpoint_filepath(func.__name__, pipeline.manager))
    observed = fetch_checkpoint_files(pipeline.manager)
    assert set(expected) == set(observed)

    # Record modification times so the second run can be compared to them.
    timestamps = {path: os.path.getmtime(path) for path in observed}

    # Delete the final stage's checkpoint file.
    # NOTE(review): the function object itself (not its __name__, as above)
    # is passed here; presumably checkpoint_filepath accepts both -- confirm.
    last_aligner_stage = pipeline.functions[-1]
    last_aligner_checkfile = checkpoint_filepath(
        last_aligner_stage, pipeline.manager)
    os.unlink(last_aligner_checkfile)

    # Only the final checkpoint should be gone.
    for path in expected[:-1]:
        assert os.path.isfile(path)
    assert not os.path.exists(last_aligner_checkfile)
    assert set(expected) != set(fetch_checkpoint_files(pipeline.manager))

    # Pause briefly so modification times from the second run can be more
    # reliably told apart from those of the first.
    time.sleep(0.05)

    # Second run, with checkpoint files present for all but the final stage.
    pipeline.run()

    # The complete set of checkpoint files should exist once more.
    observed = fetch_checkpoint_files(pipeline.manager)
    exp = set(expected)
    obs = set(observed)
    assert exp == obs, \
            "Expected only:\n{}\nExpected and observed:\n{}\nObserved only:\n{}".format(
                    exp - obs, exp & obs, obs - exp)

    # Checkpoint files that were left in place must be untouched.
    for path in expected[:-1]:
        assert timestamps[path] == os.path.getmtime(path)

    # The deleted checkpoint must have been written anew, hence be newer.
    recreated_mtime = os.path.getmtime(last_aligner_checkfile)
    original_mtime = timestamps[last_aligner_checkfile]
    assert recreated_mtime > original_mtime, \
            "Recreated checkpoint file ('{}') should be newer than original".\
           format(last_aligner_checkfile)