def test_sample_index():
    """Run through some basic testing of the SampleIndex class."""
    cases = [
        (10, 1, []),
        (10, 3, []),
        (11, 2, [5]),
        (10, 3, [3]),
        (10, 3, [1]),
        (10, 1, [3]),
        (10, 3, [1, 3]),
        (10, 1, [2]),
        (1000, 100, [500]),
        (1000, 50, [500, 100]),
        (1000000000, 100000132, []),
    ]
    for num_samples, bundle_size, dir_sizes in cases:
        print(f"############ TEST {num_samples} {bundle_size} {dir_sizes} ###########")
        # Rooting the index at address "0" guarantees the insertion at "0.1"
        # below targets a valid child address.
        idx = create_hierarchy(num_samples, bundle_size, dir_sizes, address="0")
        print(str(idx))
        try:
            idx["0.1"] = create_hierarchy(
                num_samples, bundle_size, dir_sizes, address="0.1"
            )
            print("successful set")
            print(str(idx))
        except KeyError as error:
            # Insertion at a valid address must never raise.
            print(error)
            assert False
def test_directory_writing():
    """Write several hierarchies to disk and check directories and sample paths."""
    root = os.path.join(TEST_DIR)
    index = create_hierarchy(2, 1, [1], root=root)
    expected = """: DIRECTORY MIN 0 MAX 2 NUM_BUNDLES 2
0: DIRECTORY MIN 0 MAX 1 NUM_BUNDLES 1
0.0: BUNDLE 0 MIN 0 MAX 1
1: DIRECTORY MIN 1 MAX 2 NUM_BUNDLES 1
1.0: BUNDLE 1 MIN 1 MAX 2
"""
    assert expected == str(index)
    index.write_directories()
    assert os.path.isdir(f"{TEST_DIR}/0")
    assert os.path.isdir(f"{TEST_DIR}/1")
    index.write_multiple_sample_index_files()
    clear_test_tree()

    # A large two-level hierarchy: in-range ids resolve to an existing
    # subdirectory, while an out-of-range id resolves to the root itself.
    root = os.path.join(TEST_DIR)
    index = create_hierarchy(1000000000, 10000, [100000000, 10000000], root=root)
    index.write_directories()
    sample_path = index.get_path_to_sample(123000123)
    assert os.path.exists(os.path.dirname(sample_path))
    assert sample_path != TEST_DIR
    sample_path = index.get_path_to_sample(10000000000)
    assert sample_path == TEST_DIR
    clear_test_tree()

    # Three-level variant: directory writing alone must succeed.
    root = os.path.join(TEST_DIR)
    index = create_hierarchy(
        1000000000, 10000, [100000000, 10000000, 1000000], root=root
    )
    index.write_directories()
def test_directory_path():
    """Check leaf-only and full directory strings for a 20-sample hierarchy."""
    index = create_hierarchy(20, 1, [20, 5, 1], root="")

    leaves = index.make_directory_string()
    expected_leaves = "0/0/0 0/0/1 0/0/2 0/0/3 0/0/4 0/1/0 0/1/1 0/1/2 0/1/3 0/1/4 0/2/0 0/2/1 0/2/2 0/2/3 0/2/4 0/3/0 0/3/1 0/3/2 0/3/3 0/3/4"
    assert leaves == expected_leaves

    # With just_leaf_directories=False every intermediate level appears too.
    every_dir = index.make_directory_string(just_leaf_directories=False)
    expected_all_dirs = " 0 0/0 0/0/0 0/0/1 0/0/2 0/0/3 0/0/4 0/1 0/1/0 0/1/1 0/1/2 0/1/3 0/1/4 0/2 0/2/0 0/2/1 0/2/2 0/2/3 0/2/4 0/3 0/3/0 0/3/1 0/3/2 0/3/3 0/3/4"
    assert every_dir == expected_all_dirs
def test_index_file_writing():
    """A hierarchy read back from disk resolves sample paths like the original."""
    written = create_hierarchy(
        1000000000, 10000, [100000000, 10000000, 1000000], root=TEST_DIR
    )
    written.write_directories()
    written.write_multiple_sample_index_files()
    loaded = read_hierarchy(TEST_DIR)
    assert loaded.get_path_to_sample(123000123) == written.get_path_to_sample(123000123)
def test_bundle_retrieval():
    """Sample ids must map to the expected bundle file paths."""
    index = create_hierarchy(
        1000000000, 10000, [100000000, 10000000, 1000000], root=TEST_DIR
    )
    # (sample id, expected bundle path) pairs covering the first bundle, a
    # bundle boundary, and a deep interior bundle.
    cases = [
        (123, f"{TEST_DIR}/0/0/0/samples0-10000.ext"),
        (10000, f"{TEST_DIR}/0/0/0/samples10000-20000.ext"),
        (123000123, f"{TEST_DIR}/1/2/3/samples123000000-123010000.ext"),
    ]
    for sample_id, expected in cases:
        assert index.get_path_to_sample(sample_id) == expected
def test_subhierarchy_insertion():
    """Insert a sub-hierarchy at an existing leaf and verify the merged tree."""
    index = create_hierarchy(2, 1, [1], root=TEST_DIR)
    print("Writing directories")
    index.write_directories()
    index.write_multiple_sample_index_files()
    print("reading heirarchy")
    top = read_hierarchy(os.path.abspath(TEST_DIR))
    # Bundles read back from disk carry a -1 bundle id.
    expected = """: DIRECTORY MIN 0 MAX 2 NUM_BUNDLES 2
0: DIRECTORY MIN 0 MAX 1 NUM_BUNDLES 1
0.0: BUNDLE -1 MIN 0 MAX 1
1: DIRECTORY MIN 1 MAX 2 NUM_BUNDLES 1
1.0: BUNDLE -1 MIN 1 MAX 2
"""
    assert str(top) == expected
    print("creating sub_heirarchy")
    sub_hierarchy = create_hierarchy(100, 10, address="1.0")
    print("inserting sub_heirarchy")
    top["1.0"] = sub_hierarchy
    print(str(index))
    print("after insertion")
    print(str(top))
    # The leaf at 1.0 is now a directory holding the ten inserted bundles.
    expected = """: DIRECTORY MIN 0 MAX 2 NUM_BUNDLES 2
0: DIRECTORY MIN 0 MAX 1 NUM_BUNDLES 1
0.0: BUNDLE -1 MIN 0 MAX 1
1: DIRECTORY MIN 1 MAX 2 NUM_BUNDLES 1
1.0: DIRECTORY MIN 0 MAX 100 NUM_BUNDLES 10
1.0.0: BUNDLE 0 MIN 0 MAX 10
1.0.1: BUNDLE 1 MIN 10 MAX 20
1.0.2: BUNDLE 2 MIN 20 MAX 30
1.0.3: BUNDLE 3 MIN 30 MAX 40
1.0.4: BUNDLE 4 MIN 40 MAX 50
1.0.5: BUNDLE 5 MIN 50 MAX 60
1.0.6: BUNDLE 6 MIN 60 MAX 70
1.0.7: BUNDLE 7 MIN 70 MAX 80
1.0.8: BUNDLE 8 MIN 80 MAX 90
1.0.9: BUNDLE 9 MIN 90 MAX 100
"""
    assert str(top) == expected
def test_start_sample_id():
    """start_sample_id must offset every bundle's MIN/MAX by the given amount."""
    expected = """: DIRECTORY MIN 203 MAX 303 NUM_BUNDLES 10
0: BUNDLE 0 MIN 203 MAX 213
1: BUNDLE 1 MIN 213 MAX 223
2: BUNDLE 2 MIN 223 MAX 233
3: BUNDLE 3 MIN 233 MAX 243
4: BUNDLE 4 MIN 243 MAX 253
5: BUNDLE 5 MIN 253 MAX 263
6: BUNDLE 6 MIN 263 MAX 273
7: BUNDLE 7 MIN 273 MAX 283
8: BUNDLE 8 MIN 283 MAX 293
9: BUNDLE 9 MIN 293 MAX 303
"""
    shifted = create_hierarchy(100, 10, start_sample_id=203)
    assert expected == str(shifted)
def expand_tasks_with_samples(
    self,
    dag,
    chain_,
    samples,
    labels,
    task_type,
    adapter_config,
    level_max_dirs,
    **kwargs,
):
    """
    Generate a group of celery chains of tasks from a chain of task names,
    using merlin samples and labels to do variable substitution.

    :param dag: A Merlin DAG.
    :param chain_: The list of task names to expand into a celery group of celery chains.
    :param samples: The list of lists of merlin sample values to do substitution for.
    :param labels: A list of strings containing the label associated with each column in the samples.
    :param task_type: The celery task type to create. Currently always merlin_step.
    :param adapter_config: A dictionary used for configuring maestro script adapters.
    :param level_max_dirs: The max number of directories per level in the sample hierarchy.
    """
    # Lazy %-style args keep log formatting off the hot path when DEBUG is off.
    LOG.debug("expand_tasks_with_samples called with chain,%s\n", chain_)

    # Figure out how many directories there are, make a glob string
    directory_sizes = uniform_directories(
        len(samples), bundle_size=1, level_max_dirs=level_max_dirs
    )
    glob_path = "*/" * len(directory_sizes)

    LOG.debug("creating sample_index")
    # Write a hierarchy to get the all paths string
    sample_index = create_hierarchy(
        len(samples),
        bundle_size=1,
        directory_sizes=directory_sizes,
        root="",
        n_digits=len(str(level_max_dirs)),
    )

    LOG.debug("creating sample_paths")
    sample_paths = sample_index.make_directory_string()

    LOG.debug("assembling steps")
    # the steps in the chain
    steps = [dag.step(name) for name in chain_]

    # Substitute the glob path / sample paths into each step's command prior
    # to expansion.
    steps = [
        step.clone_changing_workspace_and_cmd(
            cmd_replacement_pairs=parameter_substitutions_for_cmd(
                glob_path, sample_paths
            )
        )
        for step in steps
    ]

    needs_expansion = is_chain_expandable(steps, labels)
    LOG.debug("needs_expansion %s", needs_expansion)
    if needs_expansion:
        sample_index.name = ""
        LOG.debug("queuing merlin expansion tasks")
        found_tasks = False
        # Try each grouping level in order; once a level yields tasks, the
        # found_tasks flag stops later (finer) levels from being expanded.
        conditions = [
            lambda c: c.is_great_grandparent_of_leaf,
            lambda c: c.is_grandparent_of_leaf,
            lambda c: c.is_parent_of_leaf,
            lambda c: c.is_leaf,
        ]
        for condition in conditions:
            if not found_tasks:
                for next_index_path, next_index in sample_index.traverse(
                    conditional=condition
                ):
                    LOG.info(
                        "generating next step for range %s:%s %s",
                        next_index.min,
                        next_index.max,
                        next_index.max - next_index.min,
                    )
                    next_index.name = next_index_path
                    # Each signature covers only its node's slice of samples.
                    sig = add_merlin_expanded_chain_to_chord.s(
                        task_type,
                        steps,
                        samples[next_index.min : next_index.max],
                        labels,
                        next_index,
                        adapter_config,
                        next_index.min,
                    )
                    sig.set(queue=steps[0].get_task_queue())
                    if self.request.is_eager:
                        sig.delay()
                    else:
                        LOG.info(
                            "queuing expansion task %s:%s",
                            next_index.min,
                            next_index.max,
                        )
                        self.add_to_chord(sig, lazy=False)
                        LOG.info(
                            "merlin expansion task %s:%s queued",
                            next_index.min,
                            next_index.max,
                        )
                    found_tasks = True
    else:
        LOG.debug("queuing simple chain task")
        add_simple_chain_to_chord(self, task_type, steps, adapter_config)
        LOG.debug("simple chain task queued")
def expand_tasks_with_samples(
    self,
    _,
    dag,
    chain_,
    samples,
    labels,
    task_type,
    adapter_config,
    level_max_dirs,
    **kwargs,
):
    """
    Generate a group of celery chains of tasks from a chain of task names,
    using merlin samples and labels to do variable substitution.

    :param dag: A Merlin DAG.
    :param chain_: The list of task names to expand into a celery group of celery chains.
    :param samples: The list of lists of merlin sample values to do substitution for.
    :param labels: A list of strings containing the label associated with each column in the samples.
    :param task_type: The celery task type to create. Currently always merlin_step.
    :param adapter_config: A dictionary used for configuring maestro script adapters.
    :param level_max_dirs: The max number of directories per level in the sample hierarchy.
    """
    # Lazy %-style args keep log formatting off the hot path when DEBUG is off.
    LOG.debug("expand_tasks_with_samples called with chain,%s\n", chain_)

    # Figure out how many directories there are, make a glob string
    directory_sizes = uniform_directories(
        len(samples), bundle_size=1, level_max_dirs=level_max_dirs
    )
    # Extra trailing level of size 1 adds one more "*/" to the glob below.
    directory_sizes.append(1)
    glob_path = "*/" * len(directory_sizes)

    # Write a hierarchy to get the all paths string
    sample_index = create_hierarchy(
        len(samples), bundle_size=1, directory_sizes=directory_sizes, root=""
    )
    sample_paths = sample_index.make_directory_string()

    # the steps in the chain
    steps = [dag.step(name) for name in chain_]

    # Substitute the glob path / sample paths into each step's command prior
    # to expansion.
    steps = [
        step.clone_changing_workspace_and_cmd(
            cmd_replacement_pairs=parameter_substitutions_for_cmd(
                glob_path, sample_paths
            )
        )
        for step in steps
    ]

    workspaces = [step.get_workspace() for step in steps]
    LOG.debug("workspaces : %s", workspaces)

    needs_expansion = is_chain_expandable(steps, labels)

    if needs_expansion:
        prepare_chain_workspace(sample_index, steps)
        sample_index.name = ""
        LOG.debug("queuing merlin expansion task")
        # One signature expands the full sample range starting at offset 0.
        sig = add_merlin_expanded_chain_to_chord.s(
            task_type,
            steps,
            samples,
            labels,
            sample_index,
            adapter_config,
            0,
        )
        sig.set(queue=steps[0].get_task_queue())
        if self.request.is_eager:
            sig.delay()
        else:
            self.add_to_chord(sig, lazy=False)
        LOG.debug("merlin expansion task queued")
    else:
        LOG.debug("queuing simple chain task")
        add_simple_chain_to_chord(self, task_type, steps, adapter_config)
        LOG.debug("simple chain task queued")