def _maybe_queue_children(parent_app_name, parent_job_id):
    """
    This is basically a "set_state(completed=True)" pre-commit hook

    Assume the task identified by (parent_app_name, parent_job_id) is
    completed, and for each of that parent's children in the dag graph of
    tasks, set 1/num_parents worth of points towards that child's completion.

    If any one child has earned 1 point, then add it to its task queue

    We track the "score" of a child by counting files in the job path:
        .../parents/dependency_name/parent_app_name/parent_job_id
    """
    qbcli = shared.get_qbclient()
    gen = dt.get_children(parent_app_name, parent_job_id, True)
    for child_app_name, cjob_id, dep_grp in gen:
        # Log context shared by every message about this (parent, child) pair.
        ld = dict(
            child_app_name=child_app_name,
            child_job_id=cjob_id,
            app_name=parent_app_name,
            job_id=parent_job_id)
        # Total number of parents this child waits on, vs. the incremented
        # count of parents completed so far.  The increment is delegated to
        # the queue backend client; presumably it is atomic — TODO confirm.
        ptotal = len(list(dt.get_parents(child_app_name, cjob_id)))
        pcomplete = qbcli.increment(
            _path_num_complete_parents(child_app_name, cjob_id))
        if (pcomplete >= ptotal):
            # Every parent has reported completion: queue the child.
            log.info(
                "Parent is queuing a child task", extra=ld)
            if pcomplete > ptotal:
                # More completions than parents: suspicious unless tasks are
                # being manually re-added; flag a possible code bug.
                log.warn(
                    "For some reason, I calculated that more parents"
                    " completed than there are parents."
                    " If you aren't re-adding tasks, this could be a code bug"
                    " that results in tasks unnecessarily sitting in queue.",
                    extra=dict(
                        num_complete_dependencies=pcomplete,
                        num_total_dependencies=ptotal, **ld))
            if check_state(child_app_name, cjob_id, completed=True):
                # Re-queuing an already-completed child only happens when both
                # an ancestor and the child were manually queued and the child
                # won the race; warn that this is a user anti-pattern.
                log.warn(
                    "Queuing a previously completed child task"
                    " presumably because of the following:"
                    " 1) you manually queued both a"
                    " parent/ancestor and the child,"
                    " and 2) the child completed first."
                    " You probably shouldn't manually re-queue both parents"
                    " and children. Just queue one of them.",
                    extra=ld)
            try:
                readd_subtask(
                    child_app_name, cjob_id,
                    _reset_descendants=False,  # descendants previously handled
                    _ignore_if_queued=True)
            except exceptions.JobAlreadyQueued:
                # NOTE(review): this logs and then re-raises even though
                # _ignore_if_queued=True was requested — confirm intended.
                log.info("Child already in queue", extra=dict(**ld))
                raise
        elif (pcomplete < ptotal):
            # Not all parents are done yet; this parent's completion only
            # contributed one point towards the child's total.
            log.info(
                "Child job one step closer to being queued!",
                extra=dict(
                    num_complete_dependencies=pcomplete,
                    num_total_dependencies=ptotal, **ld))
def test_autofill_get_parents(autofill1, autofill2, autofill_getparents):
    # Very specific scenario: the child's job_id is a superset of its
    # parents' job_ids, and depends_on defines 2+ app_names where each app
    # uses a different job_id template.
    found = list(
        dag_tools.get_parents(autofill_getparents, '20150101_10_10'))
    expected = [(autofill1, '10'), (autofill2, '20150101')]
    nt.assert_items_equal(found, expected)
def test_get_parents_with_complicated_job_ids(
        func_name, valid1, valid2, valid3, valid3b, valid4):
    def parents_of(app, job_id):
        return list(dag_tools.get_parents(app, job_id))

    # a job_id that matches no dependency yields no parents
    nt.assert_items_equal(parents_of(valid3, '20151015_100'), [])

    expected = [
        (valid1, '20151015_1'),
        (valid1, '20151015_2'),
        (valid2, '20151015_101'),
    ]
    nt.assert_items_equal(parents_of(valid3, '20151015_101'), expected)

    # valid3b should behave the same as valid3
    nt.assert_items_equal(
        parents_of(valid3b, '20151015_100'),
        parents_of(valid3, '20151015_100'),
    )

    # valid4: only the matching job_id has parents
    nt.assert_items_equal(parents_of(valid4, '20151015_100'), [])
    nt.assert_items_equal(parents_of(valid4, '20151015_102'), [])
    nt.assert_items_equal(parents_of(valid4, '20151015_101'), expected)
def test_autofill_all(func_name, autofill1, autofill2, autofill3,
                      autofill_getparents):
    def children_of(app, job_id):
        return list(dag_tools.get_children(app, job_id))

    def parents_of(app, job_id):
        return list(dag_tools.get_parents(app, job_id))

    # autofill1: out-of-bounds job_ids have no children
    for job_id in ('9', '11', '20'):
        nt.assert_items_equal(children_of(autofill1, job_id), [])

    # autofill1: in-bounds job_id fans out to autofill3 and the getparents app
    nt.assert_items_equal(
        children_of(autofill1, '10'),
        [
            (autofill3, '20150101', 'default'),
            (autofill3, '20150102', 'default'),
            (autofill3, '20150103', 'default'),
            (autofill3, '20150104', 'default'),
            (autofill_getparents, '20150101_10_10', 'default'),
        ])

    # autofill2: out-of-bounds job_id has no children
    nt.assert_items_equal(children_of(autofill2, '20150128'), [])

    # autofill2: in-bounds job_id
    nt.assert_items_equal(
        children_of(autofill2, '20150101'),
        [
            (autofill3, '20150101', 'default'),
            (autofill_getparents, '20150101_10_10', 'default')
        ])

    # autofill3: no children; parents only for in-bounds job_ids
    nt.assert_items_equal(children_of(autofill3, '20150101'), [])
    nt.assert_items_equal(parents_of(autofill3, '20150128'), [])
    nt.assert_items_equal(
        parents_of(autofill3, '20150101'),
        [
            (autofill1, '10'),
            (autofill1, '12'),
            (autofill1, '14'),
            (autofill1, '16'),
            (autofill1, '18'),
            (autofill2, '20150101'),
        ])
def test_fan_out_tasks(app1, app2, app4, fanout1, func_name):
    # Many-to-many relationships between parent and child tasks.
    def jid(template):
        # suffix job_id templates with the per-test function name
        return template % func_name

    # unparseable job_id: no parents
    nt.assert_count_equal(
        list(dag_tools.get_parents(
            'test_stolos/test_fan_out_tasks/fanout1', '20140715_8')),
        [])

    # fanout1's dep2 group gathers two app1 job_ids
    nt.assert_count_equal(
        list(dag_tools.get_parents(
            'test_stolos/test_fan_out_tasks/fanout1',
            jid('20140715_testID5-%s'), True)),
        [
            (app1, jid('20140714_555_profile-%s'), u'dep2'),
            (app1, jid('20140715_555_profile-%s'), u'dep2'),
        ])

    # app1 fans out through dep1 only for this job_id
    nt.assert_count_equal(
        list(dag_tools.get_children(
            'test_stolos/test_fan_out_tasks/app1',
            jid('20140715_9_profile-%s'), True,)),
        [
            (app2, jid('20140715_9_profile-%s'), 'default'),
            (app4, jid('20140715_9_profile-%s'), 'default'),
            (fanout1, jid('20140715_testID1-%s'), u'dep1'),
            (fanout1, jid('20140715_testID2-%s'), u'dep1'),
            (fanout1, jid('20140715_testID3-%s'), u'dep1'),
        ])

    # this app1 job_id matches both dep1 and dep2 fan-out groups
    nt.assert_count_equal(
        list(dag_tools.get_children(
            app1, jid('20140715_555_profile-%s'), True,)),
        [
            (app2, jid('20140715_555_profile-%s'), 'default'),
            (app4, jid('20140715_555_profile-%s'), 'default'),
            (fanout1, jid(u'20140714_testID5-%s'), u'dep2'),
            (fanout1, jid(u'20140714_testID6-%s'), u'dep2'),
            (fanout1, jid(u'20140715_testID1-%s'), u'dep1'),
            (fanout1, jid(u'20140715_testID2-%s'), u'dep1'),
            (fanout1, jid(u'20140715_testID3-%s'), u'dep1'),
            (fanout1, jid(u'20140715_testID5-%s'), u'dep2'),
            (fanout1, jid(u'20140715_testID6-%s'), u'dep2'),
        ])
def test_topological_sort(topological_sort1, app1, app2, depends_on1, bash2,
                          depends_on_job_id1, func_name):
    # topological_sort should order all transitive parents of the node
    parent_gen = dag_tools.get_parents(
        topological_sort1, depends_on_job_id1, True,)
    found = list(dag_tools.topological_sort(parent_gen))
    expected = [
        (app1, '20140601_101_profile-%s' % func_name, u'dep1'),
        (app1, '20140601_102_profile-%s' % func_name, u'dep1'),
        (app2, '20140601_101_profile-%s' % func_name, u'dep1'),
        (app2, '20140601_102_profile-%s' % func_name, u'dep1'),
        (depends_on1, u'20140601_testID1-%s' % func_name, u'dep1'),
        (bash2, '20140601_101_profile-%s' % func_name, u'dep1'),
        (bash2, '20140601_102_profile-%s' % func_name, u'dep1'),
    ]
    nt.assert_count_equal(found, expected)
def ensure_parents_completed(app_name, job_id):
    """
    Assume that given job_id is pulled from the app_name's queue.

    Check that the parent tasks for this (app_name, job_id) pair have
    completed.  If they haven't completed and aren't pending, maybe create
    the parent task in its appropriate queue.  Also decide whether the
    calling process should requeue given job_id or remove itself from queue.

    Because this needs to happen as one transaction, also return a lock on a
    parent's execution that the calling code must release after it decides
    how to handle the current job_id.

    Returns a tuple:
        (are_parents_completed, should_job_id_be_consumed_from_queue,
         parent_execute_lock_to_release)
    """
    parents_completed = True
    consume_queue = False
    parent_lock = None
    for parent, pjob_id, dep_grp in dt.get_parents(app_name, job_id, True):
        if check_state(app_name=parent, job_id=pjob_id, completed=True):
            continue
        # Found at least one incomplete parent: I cannot run yet.
        parents_completed = False
        log.info('My parent has not completed yet.',
                 extra=dict(parent_app_name=parent, parent_job_id=pjob_id,
                            app_name=app_name, job_id=job_id))

        # At this point, I need to be re-run.  The question is whether to
        # requeue myself or assume the parent will requeue me.
        #
        # Default: requeue myself.  Sometimes this means I cycle through the
        # queue a couple of times until the parent finishes.  If the parent
        # is running, it can requeue me if I exit in time; if not, either I
        # requeue myself by default or another parent will.  So do nothing
        # special in that case.
        #
        # If the parent is not running, try to maybe_add_subtask it:
        #   - if I can't add the parent, possibly something else is adding
        #     it, or it ran once and is waiting on one of my grandparents.
        #   - if I can add it, it definitely wasn't running before.
        # In both cases, only unqueue myself if I can guarantee the parent
        # won't run by the time I unqueue myself; otherwise default to
        # assuming the parent is running and requeue myself.
        added = maybe_add_subtask(parent, pjob_id)

        # If the parent is marked 'skipped' and someone maybe_add_subtask's
        # the child, the child could requeue itself indefinitely.  To prevent
        # that, the child unqueues itself and complains: it should be
        # impossible for a child of a skipped parent to be running.
        if not added and check_state(parent, pjob_id, skipped=True):
            consume_queue = True
            # raise some sort of error
            log.warn(
                "My parent_job_id is marked as 'skipped',"
                " so should be impossible for me, the child, to exist!"
                " Requesting to unqueue myself. This is odd.",
                extra=dict(parent_app_name=parent, parent_job_id=pjob_id,
                           app_name=app_name, job_id=job_id))
            break
        if parent_lock is not None:
            continue  # we already found a parent that promises to requeue me
        # Try to grab the parent's execute lock without blocking; holding it
        # guarantees the parent cannot start running underneath us.
        elock = obtain_execute_lock(parent, pjob_id, raise_on_error=False,
                                    blocking=False)
        if elock:
            if not check_state(parent, pjob_id, pending=True):
                elock.release()  # race condition: parent just did something!
            else:
                # Parent is pending and we hold its lock: safe to unqueue
                # ourselves and let the parent requeue us when it completes.
                consume_queue = True
                parent_lock = elock
                log.info(
                    "I will unqueue myself with the expectation that"
                    " my parent will requeue me",
                    extra=dict(app_name=app_name, job_id=job_id))
    return parents_completed, consume_queue, parent_lock
def test_depends_on_all(func_name, all_test1, all_test2, all_test3, all_test4,
                        all_test4b, all_test5):
    def children_of(app, job_id):
        return list(dag_tools.get_children(app, job_id, True,))

    def parents_of(app, job_id):
        return list(dag_tools.get_parents(app, job_id, True))

    # all_test1 children
    nt.assert_items_equal(
        children_of(all_test1, '20140601_1'),
        [
            (all_test3, '20140601_1', 'default'),
            (all_test3, '20140601_2', 'default'),
            (all_test4, '20140601', 'default'),
            (all_test4b, '20140601', 'both'),
            (all_test5, '20140601', 'both_apps'),
        ])
    nt.assert_items_equal(
        children_of(all_test1, '20140601_1'),
        children_of(all_test1, '20140601_2'),
    )
    nt.assert_items_equal(children_of(all_test1, '20140601_0'), [])
    nt.assert_items_equal(children_of(all_test1, '20140601_0'), [])

    # all_test2 children
    nt.assert_items_equal(children_of(all_test2, '20140601_0'), [])
    nt.assert_items_equal(children_of(all_test2, '20140601_1'), [])
    expected_4_and_4b = [
        (all_test4, '20140601', 'default'),
        (all_test4b, '20140601', 'both')
    ]
    nt.assert_items_equal(
        children_of(all_test2, '20140601_2'), expected_4_and_4b)
    nt.assert_items_equal(
        children_of(all_test2, '20140601_3'), expected_4_and_4b)
    nt.assert_items_equal(
        children_of(all_test2, '20140601_4'),
        [(all_test5, '20140601', 'both_apps')])

    # all_test3 parents
    nt.assert_items_equal(parents_of(all_test3, "20140601_0"), [])
    nt.assert_items_equal(
        parents_of(all_test3, "20140601_1"),
        [
            (all_test1, '20140601_1', 'default'),
            (all_test1, '20140601_2', 'default'),
        ])
    nt.assert_items_equal(
        parents_of(all_test3, "20140601_2"),
        parents_of(all_test3, "20140601_1"),
    )
    nt.assert_items_equal(parents_of(all_test3, "20140601_3"), [])

    # all_test4 parents
    nt.assert_items_equal(
        parents_of(all_test4, "20140601"),
        [
            (all_test1, '20140601_1', 'default'),
            (all_test1, '20140601_2', 'default'),
            (all_test2, '20140601_2', 'default'),
            (all_test2, '20140601_3', 'default'),
        ])
    nt.assert_items_equal(
        parents_of(all_test4, "20140601"),
        parents_of(all_test4, "20140601"),
    )

    # all_test5 parents
    nt.assert_items_equal(
        parents_of(all_test5, "20140601"),
        [
            (all_test1, '20140601_1', 'both_apps'),
            (all_test1, '20140601_2', 'both_apps'),
            (all_test2, '20140601_4', 'both_apps')
        ])
def test_get_parents(app1, app2, depends_on1, depends_on2, bash1, bash2,
                     depends_on_job_id1, func_name):
    def parents_of(app, job_id, **kwargs):
        return list(dag_tools.get_parents(app, job_id, True, **kwargs))

    # no parents at all
    nt.assert_equal(parents_of(app1, '20140101_876_purchase'), [])

    # the basic inheritance scenario
    nt.assert_count_equal(
        parents_of(bash2, '20140501_876_profile'),
        [(bash1, '20140501_876_profile', 'default')]
    )

    # invalid job_id
    nt.assert_count_equal(parents_of(depends_on1, '20140101_999999'), [])

    # invalid metadata in job_id
    nt.assert_count_equal(parents_of(depends_on1, '20140601_999'), [])

    # depends_on for one of the dependency groups
    nt.assert_count_equal(
        parents_of(depends_on1, '20140601_testID2-%s' % func_name),
        [
            (depends_on2, '20140601_1011_profile-%s' % func_name, u'depgrp2'),
            (depends_on2, '20140601_9020_profile-%s' % func_name, u'depgrp2'),
            (depends_on2, '20140601_876_profile-%s' % func_name, u'depgrp2')
        ])

    # depends_on for another dependency group;
    # also tests that get_parents returns a stable ordering
    nt.assert_count_equal(
        parents_of(depends_on1, depends_on_job_id1),
        [
            (app1, '20140601_1011_profile-%s' % func_name, u'depgrp1'),
            (app1, '20140601_1011_purchase-%s' % func_name, u'depgrp1'),
            (app1, '20140601_9020_profile-%s' % func_name, u'depgrp1'),
            (app1, '20140601_9020_purchase-%s' % func_name, u'depgrp1'),
            (app1, '20140601_876_profile-%s' % func_name, u'depgrp1'),
            (app1, '20140601_876_purchase-%s' % func_name, u'depgrp1'),
            (app1, '20140601_999_purchase-%s' % func_name, u'depgrp1'),
            (app2, '20140601_1011_profile-%s' % func_name, u'depgrp1'),
            (app2, '20140601_1011_purchase-%s' % func_name, u'depgrp1'),
            (app2, '20140601_9020_profile-%s' % func_name, u'depgrp1'),
            (app2, '20140601_9020_purchase-%s' % func_name, u'depgrp1'),
            (app2, '20140601_876_profile-%s' % func_name, u'depgrp1'),
            (app2, '20140601_876_purchase-%s' % func_name, u'depgrp1')
        ]
    )

    # multiple dependency groups mapping to the same job_id is tolerated
    nt.assert_count_equal(
        parents_of(depends_on1, '20140601_testID3-%s' % func_name),
        [(app1, '20140601_444_profile-%s' % func_name, u'depgrp4'),
         (app1, '20140601_876_profile-%s' % func_name, u'depgrp3'),
         ]
    )

    # the filter_deps option restricts which dependency groups are considered
    nt.assert_count_equal(
        parents_of(depends_on1, '20140601_testID3-%s' % func_name,
                   filter_deps=['depgrp4']),
        [(app1, '20140601_444_profile-%s' % func_name, u'depgrp4')]
    )

    # an unrecognized dependency group name is a misconfiguration
    with nt.assert_raises(exceptions.DAGMisconfigured):
        parents_of(depends_on1, '20140601_testID3-%s' % func_name,
                   filter_deps=['depgrp99999'])
def ensure_parents_completed(app_name, job_id):
    """
    Assume that given job_id is pulled from the app_name's queue.

    Check that the parent tasks for this (app_name, job_id) pair have
    completed.  If they haven't completed and aren't pending, maybe create
    the parent task in its appropriate queue.  Also decide whether the
    calling process should requeue given job_id or remove itself from queue.

    Because this needs to happen as one transaction, also return a lock on a
    parent's execution that the calling code must release after it decides
    how to handle the current job_id.

    Returns a tuple:
        (are_parents_completed, should_job_id_be_consumed_from_queue,
         parent_execute_lock_to_release)
    """
    parents_completed = True
    consume_queue = False
    parent_lock = None
    for parent, pjob_id, dep_grp in dt.get_parents(app_name, job_id, True):
        if check_state(app_name=parent, job_id=pjob_id, completed=True):
            continue
        # Found at least one incomplete parent: I cannot run yet.
        parents_completed = False
        log.info(
            'My parent has not completed yet.',
            extra=dict(
                parent_app_name=parent, parent_job_id=pjob_id,
                app_name=app_name, job_id=job_id))

        # At this point, I need to be re-run.  The question is whether to
        # requeue myself or assume the parent will requeue me.
        #
        # Default: requeue myself.  Sometimes this means I cycle through the
        # queue a couple of times until the parent finishes.  If the parent
        # is running, it can requeue me if I exit in time; if not, either I
        # requeue myself by default or another parent will.  So do nothing
        # special in that case.
        #
        # If the parent is not running, try to maybe_add_subtask it:
        #   - if I can't add the parent, possibly something else is adding
        #     it, or it ran once and is waiting on one of my grandparents.
        #   - if I can add it, it definitely wasn't running before.
        # In both cases, only unqueue myself if I can guarantee the parent
        # won't run by the time I unqueue myself; otherwise default to
        # assuming the parent is running and requeue myself.
        added = maybe_add_subtask(parent, pjob_id)

        # If the parent is marked 'skipped' and someone maybe_add_subtask's
        # the child, the child could requeue itself indefinitely.  To prevent
        # that, the child unqueues itself and complains: it should be
        # impossible for a child of a skipped parent to be running.
        if not added and check_state(parent, pjob_id, skipped=True):
            consume_queue = True
            # use log.warning: log.warn is a deprecated alias (removed in
            # Python 3.13)
            log.warning(
                "My parent_job_id is marked as 'skipped',"
                " so should be impossible for me, the child, to exist!"
                " Requesting to unqueue myself. This is odd.",
                extra=dict(
                    parent_app_name=parent, parent_job_id=pjob_id,
                    app_name=app_name, job_id=job_id))
            break
        if parent_lock is not None:
            continue  # we already found a parent that promises to requeue me
        # Try to grab the parent's execute lock without blocking; holding it
        # guarantees the parent cannot start running underneath us.
        elock = obtain_execute_lock(
            parent, pjob_id, raise_on_error=False, blocking=False)
        if elock:
            if not check_state(parent, pjob_id, pending=True):
                elock.release()  # race condition: parent just did something!
            else:
                # Parent is pending and we hold its lock: safe to unqueue
                # ourselves and let the parent requeue us when it completes.
                consume_queue = True
                parent_lock = elock
                log.info(
                    "I will unqueue myself with the expectation that"
                    " my parent will requeue me",
                    extra=dict(
                        app_name=app_name, job_id=job_id))
    return parents_completed, consume_queue, parent_lock
def _maybe_queue_children(parent_app_name, parent_job_id):
    """
    This is basically a "set_state(completed=True)" pre-commit hook

    Assume the task identified by (parent_app_name, parent_job_id) is
    completed, and for each of that parent's children in the dag graph of
    tasks, set 1/num_parents worth of points towards that child's completion.

    If any one child has earned 1 point, then add it to its task queue

    We track the "score" of a child by counting files in the job path:
        .../parents/dependency_name/parent_app_name/parent_job_id
    """
    qbcli = shared.get_qbclient()
    gen = dt.get_children(parent_app_name, parent_job_id, True)
    for child_app_name, cjob_id, dep_grp in gen:
        # Log context shared by every message about this (parent, child) pair.
        ld = dict(
            child_app_name=child_app_name,
            child_job_id=cjob_id,
            app_name=parent_app_name,
            job_id=parent_job_id)
        # Total number of parents this child waits on, vs. the incremented
        # count of parents completed so far (tracked by the queue backend).
        ptotal = len(list(dt.get_parents(child_app_name, cjob_id)))
        pcomplete = qbcli.increment(
            _path_num_complete_parents(child_app_name, cjob_id))
        if pcomplete >= ptotal:
            # Every parent has reported completion: queue the child.
            log.info(
                "Parent is queuing a child task", extra=ld)
            if pcomplete > ptotal:
                # More completions than parents: suspicious unless tasks are
                # being manually re-added; flag a possible code bug.
                # (log.warning, not the deprecated log.warn alias)
                log.warning(
                    "For some reason, I calculated that more parents"
                    " completed than there are parents."
                    " If you aren't re-adding tasks, this could be a code bug"
                    " that results in tasks unnecessarily sitting in queue.",
                    extra=dict(
                        num_complete_dependencies=pcomplete,
                        num_total_dependencies=ptotal, **ld))
            if check_state(child_app_name, cjob_id, completed=True):
                # Re-queuing an already-completed child only happens when both
                # an ancestor and the child were manually queued and the child
                # won the race; warn that this is a user anti-pattern.
                log.warning(
                    "Queuing a previously completed child task"
                    " presumably because of the following:"
                    " 1) you manually queued both a"
                    " parent/ancestor and the child,"
                    " and 2) the child completed first."
                    " You probably shouldn't manually re-queue both parents"
                    " and children. Just queue one of them.",
                    extra=ld)
            try:
                readd_subtask(
                    child_app_name, cjob_id,
                    _reset_descendants=False,  # descendants previously handled
                    _ignore_if_queued=True)
            except exceptions.JobAlreadyQueued:
                log.info("Child already in queue", extra=dict(**ld))
                raise
        else:
            # pcomplete < ptotal is the only remaining case: this parent's
            # completion only contributed one point towards the child's total.
            log.info(
                "Child job one step closer to being queued!",
                extra=dict(
                    num_complete_dependencies=pcomplete,
                    num_total_dependencies=ptotal, **ld))