Ejemplo n.º 1
0
def test_sample_index():
    """Run through some basic testing of the SampleIndex class.

    Each case is a ``(num_samples, bundle_size, directory_sizes)`` triple
    passed positionally to ``create_hierarchy``. For every case a hierarchy
    is built at root address ``"0"`` and printed, then a second hierarchy
    built from the same arguments at address ``"0.1"`` is assigned to
    ``idx["0.1"]``.

    :raises AssertionError: if the assignment raises ``KeyError``; the
        original ``KeyError`` is chained as the cause.
    """
    tests = [
        (10, 1, []),
        (10, 3, []),
        (11, 2, [5]),
        (10, 3, [3]),
        (10, 3, [1]),
        (10, 1, [3]),
        (10, 3, [1, 3]),
        (10, 1, [2]),
        (1000, 100, [500]),
        (1000, 50, [500, 100]),
        (1000000000, 100000132, []),
    ]

    for args in tests:
        print(f"############ TEST {args[0]} {args[1]} {args[2]} ###########")
        # put at root address of "0" to guarantee insertion at "0.1" later is valid
        idx = create_hierarchy(args[0], args[1], args[2], address="0")
        print(str(idx))
        try:
            idx["0.1"] = create_hierarchy(args[0], args[1], args[2], address="0.1")
        except KeyError as error:
            print(error)
            # Raise explicitly instead of `assert False`: a bare assert is
            # stripped under `python -O` and would let the failure pass
            # silently; chaining keeps the original KeyError in the report.
            raise AssertionError(
                f"setting sub-hierarchy at address '0.1' failed for case {args}"
            ) from error
        else:
            print("successful set")
            print(str(idx))
Ejemplo n.º 2
0
def test_directory_writing():
    """Check directory creation and sample-path lookup for hierarchies rooted at TEST_DIR.

    Three hierarchies are built in turn, with ``clear_test_tree()`` called
    between them:

    1. A two-sample hierarchy whose string dump is compared against a fixed
       expectation and whose ``0`` and ``1`` directories must exist on disk
       after ``write_directories()``.
    2. A two-level hierarchy used to check ``get_path_to_sample`` for one
       sample id whose directory must exist and one id for which the lookup
       must return ``TEST_DIR`` itself.
    3. A three-level hierarchy for which only ``write_directories()`` is run.
    """
    # --- small hierarchy: string dump and on-disk directories ---
    root = os.path.join(TEST_DIR)
    small_index = create_hierarchy(2, 1, [1], root=root)
    expected = (
        ": DIRECTORY MIN 0 MAX 2 NUM_BUNDLES 2\n"
        "   0: DIRECTORY MIN 0 MAX 1 NUM_BUNDLES 1\n"
        "      0.0: BUNDLE 0 MIN 0 MAX 1\n"
        "   1: DIRECTORY MIN 1 MAX 2 NUM_BUNDLES 1\n"
        "      1.0: BUNDLE 1 MIN 1 MAX 2\n"
    )
    assert expected == str(small_index)
    small_index.write_directories()
    assert os.path.isdir(f"{TEST_DIR}/0")
    assert os.path.isdir(f"{TEST_DIR}/1")
    small_index.write_multiple_sample_index_files()

    clear_test_tree()

    # --- two directory levels: path lookup ---
    root = os.path.join(TEST_DIR)
    two_level_index = create_hierarchy(
        1_000_000_000, 10_000, [100_000_000, 10_000_000], root=root
    )
    two_level_index.write_directories()
    found_path = two_level_index.get_path_to_sample(123_000_123)
    assert os.path.exists(os.path.dirname(found_path))
    assert found_path != TEST_DIR
    # This id is larger than the first argument given to create_hierarchy;
    # the lookup is expected to hand back the root directory unchanged.
    fallback_path = two_level_index.get_path_to_sample(10_000_000_000)
    assert fallback_path == TEST_DIR

    clear_test_tree()

    # --- three directory levels: only directory creation is exercised ---
    root = os.path.join(TEST_DIR)
    three_level_index = create_hierarchy(
        1_000_000_000, 10_000, [100_000_000, 10_000_000, 1_000_000], root=root
    )
    three_level_index.write_directories()
Ejemplo n.º 3
0
def test_directory_path():
    """Compare ``make_directory_string`` output against fixed directory listings.

    The default call must return the space-separated leaf directories; the
    call with ``just_leaf_directories=False`` must return every directory,
    starting with the empty root entry (hence the leading space).
    """
    index = create_hierarchy(20, 1, [20, 5, 1], root="")

    expected_leaves = "0/0/0 0/0/1 0/0/2 0/0/3 0/0/4 0/1/0 0/1/1 0/1/2 0/1/3 0/1/4 0/2/0 0/2/1 0/2/2 0/2/3 0/2/4 0/3/0 0/3/1 0/3/2 0/3/3 0/3/4"
    assert index.make_directory_string() == expected_leaves

    expected_all_dirs = " 0 0/0 0/0/0 0/0/1 0/0/2 0/0/3 0/0/4 0/1 0/1/0 0/1/1 0/1/2 0/1/3 0/1/4 0/2 0/2/0 0/2/1 0/2/2 0/2/3 0/2/4 0/3 0/3/0 0/3/1 0/3/2 0/3/3 0/3/4"
    assert (
        index.make_directory_string(just_leaf_directories=False)
        == expected_all_dirs
    )
Ejemplo n.º 4
0
def test_index_file_writing():
    """Write a hierarchy's index files and check that a re-read copy agrees.

    The hierarchy is written under ``TEST_DIR``, read back with
    ``read_hierarchy``, and both objects must resolve the same sample id to
    the same path.
    """
    sample_id = 123_000_123
    written = create_hierarchy(
        1_000_000_000,
        10_000,
        [100_000_000, 10_000_000, 1_000_000],
        root=TEST_DIR,
    )
    written.write_directories()
    written.write_multiple_sample_index_files()

    reloaded = read_hierarchy(TEST_DIR)
    assert reloaded.get_path_to_sample(sample_id) == written.get_path_to_sample(
        sample_id
    )
Ejemplo n.º 5
0
def test_bundle_retrieval():
    """Check ``get_path_to_sample`` against known bundle file paths.

    Each case pairs a sample id with the bundle file path expected for it in
    a three-level hierarchy rooted at ``TEST_DIR``.
    """
    index = create_hierarchy(
        1_000_000_000,
        10_000,
        [100_000_000, 10_000_000, 1_000_000],
        root=TEST_DIR,
    )
    cases = [
        (123, f"{TEST_DIR}/0/0/0/samples0-10000.ext"),
        (10_000, f"{TEST_DIR}/0/0/0/samples10000-20000.ext"),
        (123_000_123, f"{TEST_DIR}/1/2/3/samples123000000-123010000.ext"),
    ]
    for sample_id, expected in cases:
        result = index.get_path_to_sample(sample_id)
        assert expected == result
Ejemplo n.º 6
0
def test_subhierarchy_insertion():
    """Insert a freshly built hierarchy into one that was read back from disk.

    A two-sample hierarchy is written under ``TEST_DIR`` and re-read; its
    string dump is checked (bundles read from disk are expected to print
    ``BUNDLE -1``). A 100-sample, bundle-size-10 hierarchy addressed
    ``"1.0"`` is then assigned at ``top["1.0"]`` and the dump is checked
    again, with the ``1.0`` bundle replaced by a directory of ten bundles.
    """
    on_disk = create_hierarchy(2, 1, [1], root=TEST_DIR)
    print("Writing directories")
    on_disk.write_directories()
    on_disk.write_multiple_sample_index_files()

    print("reading heirarchy")
    top = read_hierarchy(os.path.abspath(TEST_DIR))
    expected_before = (
        ": DIRECTORY MIN 0 MAX 2 NUM_BUNDLES 2\n"
        "   0: DIRECTORY MIN 0 MAX 1 NUM_BUNDLES 1\n"
        "      0.0: BUNDLE -1 MIN 0 MAX 1\n"
        "   1: DIRECTORY MIN 1 MAX 2 NUM_BUNDLES 1\n"
        "      1.0: BUNDLE -1 MIN 1 MAX 2\n"
    )
    assert str(top) == expected_before

    print("creating sub_heirarchy")
    sub_hierarchy = create_hierarchy(100, 10, address="1.0")
    print("inserting sub_heirarchy")
    top["1.0"] = sub_hierarchy

    # Dump both the originally created index and the modified re-read one.
    print(str(on_disk))
    print("after insertion")
    print(str(top))
    expected_after = (
        ": DIRECTORY MIN 0 MAX 2 NUM_BUNDLES 2\n"
        "   0: DIRECTORY MIN 0 MAX 1 NUM_BUNDLES 1\n"
        "      0.0: BUNDLE -1 MIN 0 MAX 1\n"
        "   1: DIRECTORY MIN 1 MAX 2 NUM_BUNDLES 1\n"
        "      1.0: DIRECTORY MIN 0 MAX 100 NUM_BUNDLES 10\n"
        "         1.0.0: BUNDLE 0 MIN 0 MAX 10\n"
        "         1.0.1: BUNDLE 1 MIN 10 MAX 20\n"
        "         1.0.2: BUNDLE 2 MIN 20 MAX 30\n"
        "         1.0.3: BUNDLE 3 MIN 30 MAX 40\n"
        "         1.0.4: BUNDLE 4 MIN 40 MAX 50\n"
        "         1.0.5: BUNDLE 5 MIN 50 MAX 60\n"
        "         1.0.6: BUNDLE 6 MIN 60 MAX 70\n"
        "         1.0.7: BUNDLE 7 MIN 70 MAX 80\n"
        "         1.0.8: BUNDLE 8 MIN 80 MAX 90\n"
        "         1.0.9: BUNDLE 9 MIN 90 MAX 100\n"
    )
    assert str(top) == expected_after
Ejemplo n.º 7
0
def test_start_sample_id():
    """Check that ``start_sample_id`` offsets every MIN/MAX in the hierarchy dump.

    A 100-sample hierarchy with bundle size 10 and ``start_sample_id=203``
    must print ten bundles covering 203 through 303.
    """
    offset_index = create_hierarchy(100, 10, start_sample_id=203)
    expected = (
        ": DIRECTORY MIN 203 MAX 303 NUM_BUNDLES 10\n"
        "   0: BUNDLE 0 MIN 203 MAX 213\n"
        "   1: BUNDLE 1 MIN 213 MAX 223\n"
        "   2: BUNDLE 2 MIN 223 MAX 233\n"
        "   3: BUNDLE 3 MIN 233 MAX 243\n"
        "   4: BUNDLE 4 MIN 243 MAX 253\n"
        "   5: BUNDLE 5 MIN 253 MAX 263\n"
        "   6: BUNDLE 6 MIN 263 MAX 273\n"
        "   7: BUNDLE 7 MIN 273 MAX 283\n"
        "   8: BUNDLE 8 MIN 283 MAX 293\n"
        "   9: BUNDLE 9 MIN 293 MAX 303\n"
    )
    assert expected == str(offset_index)
Ejemplo n.º 8
0
def expand_tasks_with_samples(
    self,
    dag,
    chain_,
    samples,
    labels,
    task_type,
    adapter_config,
    level_max_dirs,
    **kwargs,
):
    """
    Queue the tasks for a chain of step names, expanding over merlin samples
    when the chain needs it.

    A sample hierarchy (bundle size 1) is built to obtain the directory
    string that is substituted into each step's command. If
    ``is_chain_expandable`` reports the chain as expandable, one
    ``add_merlin_expanded_chain_to_chord`` signature is queued per hierarchy
    node at the first traversal level that yields any nodes; otherwise the
    chain is handed to ``add_simple_chain_to_chord``.

    :param self : The task instance; ``self.request.is_eager`` and
        ``self.add_to_chord`` are used (presumably a bound celery task).
    :param dag : A Merlin DAG.
    :param chain_ : The list of task names to expand into a celery group of celery chains.
    :param samples : The list of lists of merlin sample values to do substitution for.
    :param labels : A list of strings containing the label associated with each column in the samples.
    :param task_type : The celery task type to create. Currently always merlin_step.
    :param adapter_config : A dictionary used for configuring maestro script adapters.
    :param level_max_dirs : The max number of directories per level in the sample hierarchy.
    :param kwargs : Accepted but not used by this function.
    """
    LOG.debug(f"expand_tasks_with_samples called with chain,{chain_}\n")

    num_samples = len(samples)

    # One "*/" glob component per directory level of the hierarchy.
    directory_sizes = uniform_directories(
        num_samples, bundle_size=1, level_max_dirs=level_max_dirs
    )
    glob_path = "*/" * len(directory_sizes)

    LOG.debug("creating sample_index")
    # The hierarchy is only used here to produce the directory string and
    # to drive the traversal below.
    sample_index = create_hierarchy(
        num_samples,
        bundle_size=1,
        directory_sizes=directory_sizes,
        root="",
        n_digits=len(str(level_max_dirs)),
    )

    LOG.debug("creating sample_paths")
    sample_paths = sample_index.make_directory_string()

    LOG.debug("assembling steps")
    dag_steps = [dag.step(name) for name in chain_]

    # Clone every step with the glob/sample-path substitutions applied to
    # its command before deciding whether expansion is required.
    steps = []
    for dag_step in dag_steps:
        replacements = parameter_substitutions_for_cmd(glob_path, sample_paths)
        steps.append(
            dag_step.clone_changing_workspace_and_cmd(
                cmd_replacement_pairs=replacements
            )
        )

    needs_expansion = is_chain_expandable(steps, labels)
    LOG.debug(f"needs_expansion {needs_expansion}")

    if not needs_expansion:
        LOG.debug("queuing simple chain task")
        add_simple_chain_to_chord(self, task_type, steps, adapter_config)
        LOG.debug("simple chain task queued")
        return

    sample_index.name = ""
    LOG.debug("queuing merlin expansion tasks")

    # Traversal filters, tried in this order; the first one that yields at
    # least one node is the only one whose nodes get queued.
    conditions = (
        lambda c: c.is_great_grandparent_of_leaf,
        lambda c: c.is_grandparent_of_leaf,
        lambda c: c.is_parent_of_leaf,
        lambda c: c.is_leaf,
    )
    for condition in conditions:
        found_tasks = False
        for next_index_path, next_index in sample_index.traverse(
            conditional=condition
        ):
            LOG.info(
                f"generating next step for range {next_index.min}:{next_index.max} {next_index.max-next_index.min}"
            )
            next_index.name = next_index_path

            # Each expansion task receives only its own slice of samples,
            # plus the slice's starting offset as the last argument.
            sig = add_merlin_expanded_chain_to_chord.s(
                task_type,
                steps,
                samples[next_index.min:next_index.max],
                labels,
                next_index,
                adapter_config,
                next_index.min,
            )
            sig.set(queue=steps[0].get_task_queue())

            if self.request.is_eager:
                sig.delay()
            else:
                LOG.info(
                    f"queuing expansion task {next_index.min}:{next_index.max}"
                )
                self.add_to_chord(sig, lazy=False)
            LOG.info(
                f"merlin expansion task {next_index.min}:{next_index.max} queued"
            )
            found_tasks = True

        if found_tasks:
            break
Ejemplo n.º 9
0
def expand_tasks_with_samples(
    self,
    _,
    dag,
    chain_,
    samples,
    labels,
    task_type,
    adapter_config,
    level_max_dirs,
    **kwargs,
):
    """
    Queue the tasks for a chain of step names, expanding over merlin samples
    when the chain needs it.

    A sample hierarchy (bundle size 1) is built to obtain the directory
    string that is substituted into each step's command. If
    ``is_chain_expandable`` reports the chain as expandable, the chain
    workspace is prepared and a single
    ``add_merlin_expanded_chain_to_chord`` signature covering all samples is
    queued; otherwise the chain is handed to ``add_simple_chain_to_chord``.

    :param self : The task instance; ``self.request.is_eager`` and
        ``self.add_to_chord`` are used (presumably a bound celery task).
    :param _ : Positional placeholder; its value is ignored.
    :param dag : A Merlin DAG.
    :param chain_ : The list of task names to expand into a celery group of celery chains.
    :param samples : The list of lists of merlin sample values to do substitution for.
    :param labels : A list of strings containing the label associated with each column in the samples.
    :param task_type : The celery task type to create. Currently always merlin_step.
    :param adapter_config : A dictionary used for configuring maestro script adapters.
    :param level_max_dirs : The max number of directories per level in the sample hierarchy.
    :param kwargs : Accepted but not used by this function.
    """
    LOG.debug(f"expand_tasks_with_samples called with chain,{chain_}\n")

    # Directory sizes per level, with an extra final level of size 1
    # appended; the glob gets one "*/" component per level.
    directory_sizes = uniform_directories(
        len(samples), bundle_size=1, level_max_dirs=level_max_dirs
    )
    directory_sizes.append(1)
    glob_path = "*/" * len(directory_sizes)

    # The hierarchy supplies the directory string used in the substitutions.
    sample_index = create_hierarchy(
        len(samples),
        bundle_size=1,
        directory_sizes=directory_sizes,
        root="",
    )
    sample_paths = sample_index.make_directory_string()

    dag_steps = [dag.step(name) for name in chain_]

    # Clone every step with the glob/sample-path substitutions applied to
    # its command before deciding whether expansion is required.
    steps = []
    for dag_step in dag_steps:
        replacements = parameter_substitutions_for_cmd(glob_path, sample_paths)
        steps.append(
            dag_step.clone_changing_workspace_and_cmd(
                cmd_replacement_pairs=replacements
            )
        )

    workspaces = [step.get_workspace() for step in steps]
    LOG.debug(f"workspaces : {workspaces}")

    if not is_chain_expandable(steps, labels):
        LOG.debug("queuing simple chain task")
        add_simple_chain_to_chord(self, task_type, steps, adapter_config)
        LOG.debug("simple chain task queued")
        return

    prepare_chain_workspace(sample_index, steps)
    sample_index.name = ""
    LOG.debug("queuing merlin expansion task")

    # A single expansion task covers the full sample list, starting at
    # offset 0.
    sig = add_merlin_expanded_chain_to_chord.s(
        task_type,
        steps,
        samples,
        labels,
        sample_index,
        adapter_config,
        0,
    )
    sig.set(queue=steps[0].get_task_queue())

    if self.request.is_eager:
        sig.delay()
    else:
        self.add_to_chord(sig, lazy=False)
    LOG.debug("merlin expansion task queued")