def _build_dict_deps(dg, app_name, deps):
    """Build edges between dependent nodes by looking at listed dependencies

    `dg` (nx.MultiDiGraph instance) - the Tasks configuration as a graph
    `app_name` (str) - the name of a scheduled application
    `deps` (obj) - the dependencies for given `app_name`.  Should be a
        subclass of cb.TasksConfigBaseMapping, and is the value of given
        app_name's "depends_on" field
    """
    log_details = dict(app_name=app_name, key='depends_on', deps=dict(deps))
    # A mapping with a top-level "app_name" key is itself a single, unnamed
    # dependency group; give it the default group name.
    if isinstance(deps, cb.TasksConfigBaseMapping) and "app_name" in deps:
        _add_edges(
            dg, app_name=app_name,
            dep_name=get_NS().dependency_group_default_name,
            dep_grp=deps, log_details=log_details)
        return
    # Otherwise each key names its own dependency group, whose value is
    # either one group (mapping) or several groups (sequence of mappings).
    for dep_name, dep_data in deps.items():
        if isinstance(dep_data, cb.TasksConfigBaseMapping):
            _add_edges(
                dg=dg, app_name=app_name, dep_name=dep_name,
                dep_grp=dep_data, log_details=log_details)
        elif isinstance(dep_data, cb.TasksConfigBaseSequence):
            for grp in dep_data:
                _add_edges(
                    dg=dg, app_name=app_name, dep_name=dep_name,
                    dep_grp=grp, log_details=log_details)
        else:
            _log_raise(
                "Unrecognized dependency. Expected a list or dict",
                dict(dep_name=dep_name, dep_data=dep_data, **log_details),
                exception_kls=DAGMisconfigured)
def _inplace_modify_depends_on(dep_group, child_app_name, child_job_id,
                               parent_app_name, ld):
    """Update `dep_group` in place with job_id information.

    Given metadata about a dependency group, set the dep_group['job_id']
    value.  Assume the dependency group only specifies an app_name key.
    Also, if the field for each identifier in the current job_id does not
    exist in the dependency group, add it.
    """
    # if only "app_name" is defined in this dependency group, assume the
    # child inherited the parent's job_id and passed that to this child
    if child_job_id is None:
        _log_raise(
            ("It's impossible to get all parent job_ids if the"
             " child expects to inherit the parent's job_id and you"
             " haven't specified the child's job_id"),
            extra=dict(parent_app_name=parent_app_name, **ld),
            exception_kls=DAGMisconfigured)
    parsed = parse_job_id(child_app_name, child_job_id)
    if len(dep_group) != 1 or len(dep_group['app_name']) != 1:
        # add any job_id components the dependency group doesn't define yet
        for key, val in parsed.items():
            if key not in dep_group:
                dep_group[key] = [val]
        return
    templ, _ = get_job_id_template(parent_app_name)
    try:
        dep_group['job_id'] = [templ.format(**parsed)]
    except Exception as err:
        _log_raise(
            ("The child job_id doesn't contain enough pjob_id data to"
             " create the parent job_id. Err details: %s") % err,
            extra=dict(job_id_template=templ, pjob_iddata=str(parsed),
                       **ld),
            exception_kls=err.__class__)
def validate_if_or(app_name1, metadata, dg, tasks_conf, ld):
    """Validate the `valid_if_or` section of an app's task configuration.

    `app_name1` (str) - name of the app being validated
    `metadata` (mapping) - the app's task configuration
    `dg`, `tasks_conf` - unused here; kept so all validators share the
        same signature
    `ld` (dict) - log details merged into raised-exception context

    Raises DAGMisconfigured if the section is malformed.
    """
    # valid_if_or - are we specifying what makes a job valid correctly?
    if 'valid_if_or' not in metadata:
        return
    for k, v in metadata['valid_if_or'].items():
        if k == '_func':
            # "_func" names a custom validation callable; not a job_id key
            continue
        location = "%s.valid_if_or.%s" % (app_name1, k)
        _log_raise_if(
            not isinstance(v, cb.TasksConfigBaseSequence),
            "Task is misconfigured. Wrong value type. Expected a sequence",
            extra=dict(wrong_value_type=type(v), key=location, **ld),
            exception_kls=DAGMisconfigured)
        # consistency fix: the rest of this module calls
        # get_job_id_template() directly, not via `node.`
        templ = get_job_id_template(app_name1)[1]
        _log_raise_if(
            k not in templ,
            "valid_if_or contains a key that isn't in its job_id template",
            extra=dict(key=k, job_id_template=templ, **ld),
            exception_kls=DAGMisconfigured)
        try:
            validation_func = get_NS().job_id_validations[k]
        except KeyError:
            # no validation registered for this component; nothing to check
            continue
        for vv in v:
            try:
                validation_func(vv)
            except Exception as err:
                _log_raise(
                    ("valid_if_or contains a value that wasn't validated"
                     " by your job_id_validations. err: %s(%s)") % (
                        err.__class__, err),
                    extra=dict(key=location, wrong_value_type=type(vv), **ld),
                    exception_kls=DAGMisconfigured)
def _add_edges(dg, app_name, dep_name, dep_grp, log_details):
    """Add edge(s) to a networkx graph instance

    `dg` is an instance of a nx.MultiDiGraph, which means we can have
    multiple edges between two nodes
    `dep_name` (str) - the name of a dependency group
    `dep_grp` (obj) - dependency group data.  Subclass of
    cb.TasksConfigBaseMapping.  An example of what this may look like is:

        dep_grp = {
            "app_name": ["test_app"],
            "date": [20140601],
            "client_id": [123, 140, 150],
            ...
        }
    """
    try:
        parent = dep_grp['app_name']
    except (KeyError, TypeError):
        raise DAGMisconfigured(
            "You defined a dependency but forgot to include the app_name")
    # "app_name" may hold either one parent name or a sequence of them
    if isinstance(parent, six.string_types):
        dg.add_edge(parent, app_name, key=dep_name, label=dep_name)
    elif isinstance(parent, cb.TasksConfigBaseSequence):
        for parent_name in parent:
            dg.add_edge(parent_name, app_name, key=dep_name, label=dep_name)
    else:
        _log_raise(
            ("Unrecognized type:"
             " I found a child that doesn't properly define parents."
             " Children should have the parent app_name"
             " define a string or sequence"
             " of strings that represent the child's parents."),
            dict(parent_app_name=parent, parent_app_name_type=type(parent),
                 **log_details),
            exception_kls=DAGMisconfigured)
def _add_edges(dg, app_name, dep_name, dep_grp, log_details):
    """Add edge(s) to a networkx graph instance

    `dg` is an instance of a nx.MultiDiGraph, which means we can have
    multiple edges between two nodes
    `dep_name` (str) - the name of a dependency group
    `dep_grp` (obj) - dependency group data.  Subclass of
    cb.TasksConfigBaseMapping.  An example of what this may look like is:

        dep_grp = {
            "app_name": ["test_app"],
            "date": [20140601],
            "client_id": [123, 140, 150],
            ...
        }
    """
    try:
        parent = dep_grp['app_name']
    except (KeyError, TypeError):
        raise DAGMisconfigured(
            "You defined a dependency but forgot to include the app_name")
    # normalize to a sequence of parent names when the type is recognized
    parents = None
    if isinstance(parent, six.string_types):
        parents = (parent,)
    elif isinstance(parent, cb.TasksConfigBaseSequence):
        parents = parent
    if parents is None:
        _log_raise(
            ("Unrecognized type:"
             " I found a child that doesn't properly define parents."
             " Children should have the parent app_name"
             " define a string or sequence"
             " of strings that represent the child's parents."),
            dict(parent_app_name=parent, parent_app_name_type=type(parent),
                 **log_details),
            exception_kls=DAGMisconfigured)
    else:
        for parent_name in parents:
            dg.add_edge(parent_name, app_name, key=dep_name, label=dep_name)
def parse_job_id(app_name, job_id, delimiter=None):
    """Convert given `job_id` into a dict

    `app_name` (str) identifies a task
    `job_id` (str) identifies an instance of a task (ie a subtask)
    `delimiter` (str) value to split job_id into different components;
        defaults to the configured job_id delimiter

    ie:
        20140506_876_profile -->

        {'date': 20140506, 'client_id': 876, 'collection_name': 'profile'}

    Returned values are cast into the appropriate type by the validations
    funcs
    """
    if delimiter is None:
        delimiter = get_NS().job_id_delimiter
    templ, parsed_templ = get_job_id_template(app_name)
    # split into at most as many pieces as the template defines
    components = job_id.split(delimiter, len(parsed_templ) - 1)
    if len(components) != len(parsed_templ):
        _log_raise(
            ("Job_id isn't properly delimited. You might have too few"
             " or too many underscores."),
            extra=dict(job_id=job_id, app_name=app_name,
                       job_id_template=templ),
            exception_kls=InvalidJobId)
    return _validate_job_id_identifiers(app_name, components)
def _validate_dep_grp_with_job_id_validations(dep_grp, ld, tasks_conf):
    """Do the user defined job_id validations, if they exist, apply to each
    individual value of the relevant key in the dep group?

    `dep_grp` (mapping) - one dependency group from a "depends_on" section
    `ld` (dict) - log details merged into raised-exception context
    `tasks_conf` (mapping) - full tasks configuration, keyed by app_name

    Raises DAGMisconfigured on invalid dependency group data.
    """
    for k, v in dep_grp.items():
        # don't do validation on depends_on."app_name" field here,
        # and not for the depends_on."job_id" either
        # These fields are the only two fields in depends_on that are
        # not job_id components
        if k in ["app_name", "job_id"]:
            continue
        func = get_NS().job_id_validations.get(k)
        # ensure that job_id validations are fully specified for keys in
        # depends_on sections
        _log_raise_if(
            not func,
            "You introduced a new job_id component in a"
            " <app_name>.depends_on.<key> subsection, and you must inform"
            " Stolos how to parse the component",
            extra=dict(key=k, value=v, **ld),
            exception_kls=DAGMisconfigured)
        # skip rest of validations if "all" is used
        if v == "all":
            # assert that autofill_values exists on all parents
            msg = (
                " You requested that child depends on \"all\" values for some"
                " part of its parent job_id_template. If you do this,"
                " the parent must define"
                " <parent_app_name>.autofill_values.<key>")
            for parent in dep_grp['app_name']:
                _log_raise_if(
                    k not in tasks_conf[parent].get('autofill_values', {}),
                    msg,
                    extra=dict(parent_app_name=parent, key=k, **ld),
                    exception_kls=DAGMisconfigured)
            continue
        for vv in v:
            try:
                res = func(vv)
            except Exception as err:
                # bug fix: `err.message` was removed in Python 3 (and is
                # deprecated in py2); format the exception itself instead
                _log_raise(
                    ("Invalid data at <app_name>.depends_on.<key>.[nth_value]."
                     " The job_id_validation function complained that the"
                     " value was invalid. Error details: %s") % err,
                    extra=dict(key='%s.%s' % (k, v), value=vv, **ld),
                    exception_kls=DAGMisconfigured)
            _log_raise_if(
                vv != res,
                ("A job_id_validation func just returned a modified"
                 " value. It should return input unmodified or fail."),
                extra=dict(key='%s.%s' % (k, v), value=vv,
                           job_id_validation=func, **ld),
                exception_kls=DAGMisconfigured)
def _inplace_modify_depends_on(dep_group, child_app_name, child_job_id,
                               parent_app_name, ld):
    """Given metadata about a dependency group, set the dep_group['job_id']
    value.  Assume the dependency group only specifies an app_name key.
    Also, if the field for each identifier in the current job_id does not
    exist in the dependency group, add it.  Basically, just update the
    dependency group with information
    """
    # if only "app_name" is defined in this dependency group,
    # assume child inherited the parent's job_id and passed that
    # to this child
    if child_job_id is None:
        _log_raise(
            ("It's impossible to get all parent job_ids if the"
             " child expects to inherit the parent's job_id and you"
             " haven't specified the child's job_id"),
            extra=dict(parent_app_name=parent_app_name, **ld),
            exception_kls=DAGMisconfigured)
    parsed = parse_job_id(child_app_name, child_job_id)
    only_app_name = len(dep_group) == 1 and len(dep_group['app_name']) == 1
    if only_app_name:
        template, _ptemplate = get_job_id_template(parent_app_name)
        try:
            dep_group['job_id'] = [template.format(**parsed)]
        except Exception as err:
            _log_raise(
                ("The child job_id doesn't contain enough pjob_id data to"
                 " create the parent job_id. Err details: %s") % err,
                extra=dict(job_id_template=template,
                           pjob_iddata=str(parsed), **ld),
                exception_kls=err.__class__)
    else:
        # fill in any job_id components the group does not already define
        for key in parsed:
            if key not in dep_group:
                dep_group[key] = [parsed[key]]
def parse_job_id(app_name, job_id, delimiter=None):
    """Convert given `job_id` into a dict

    `app_name` (str) identifies a task
    `job_id` (str) identifies an instance of a task (ie a subtask)
    `delimiter` (str) value to split job_id into different components;
        defaults to the configured job_id delimiter

    ie:
        20140506_876_profile -->

        {'date': 20140506, 'client_id': 876, 'collection_name': 'profile'}

    Returned values are cast into the appropriate type by the validations
    funcs
    """
    delim = delimiter if delimiter is not None else get_NS().job_id_delimiter
    template, ptemplate = get_job_id_template(app_name)
    # cap the number of splits so trailing components may contain the
    # delimiter character
    pieces = job_id.split(delim, len(ptemplate) - 1)
    ld = dict(job_id=job_id, app_name=app_name, job_id_template=template)
    if len(pieces) != len(ptemplate):
        _log_raise(
            ("Job_id isn't properly delimited. You might have too few"
             " or too many underscores."),
            extra=ld, exception_kls=InvalidJobId)
    return _validate_job_id_identifiers(app_name, pieces)
def _validate_dep_grp_with_job_id_validations(dep_grp, ld, tasks_conf):
    """Do the user defined job_id validations, if they exist, apply to each
    individual value of the relevant key in the dep group?

    `dep_grp` (mapping) - one dependency group from a "depends_on" section
    `ld` (dict) - log details merged into raised-exception context
    `tasks_conf` (mapping) - full tasks configuration, keyed by app_name

    Raises DAGMisconfigured on invalid dependency group data.
    """
    for k, v in dep_grp.items():
        # don't do validation on depends_on."app_name" field here,
        # and not for the depends_on."job_id" either
        # These fields are the only two fields in depends_on that are
        # not job_id components
        if k in ["app_name", "job_id"]:
            continue
        func = get_NS().job_id_validations.get(k)
        # ensure that job_id validations are fully specified for keys in
        # depends_on sections
        _log_raise_if(
            not func,
            "You introduced a new job_id component in a"
            " <app_name>.depends_on.<key> subsection, and you must inform"
            " Stolos how to parse the component",
            extra=dict(key=k, value=v, **ld),
            exception_kls=DAGMisconfigured)
        # skip rest of validations if "all" is used
        if v == "all":
            # assert that autofill_values exists on all parents
            msg = (
                " You requested that child depends on \"all\" values for some"
                " part of its parent job_id_template. If you do this,"
                " the parent must define"
                " <parent_app_name>.autofill_values.<key>")
            for parent in dep_grp['app_name']:
                _log_raise_if(
                    k not in tasks_conf[parent].get('autofill_values', {}),
                    msg,
                    extra=dict(parent_app_name=parent, key=k, **ld),
                    exception_kls=DAGMisconfigured)
            continue
        for vv in v:
            try:
                res = func(vv)
            except Exception as err:
                # bug fix: `err.message` was removed in Python 3 (and is
                # deprecated in py2); format the exception itself instead
                _log_raise((
                    "Invalid data at <app_name>.depends_on.<key>.[nth_value]."
                    " The job_id_validation function complained that the"
                    " value was invalid. Error details: %s"
                ) % err,
                    extra=dict(key='%s.%s' % (k, v), value=vv, **ld),
                    exception_kls=DAGMisconfigured)
            _log_raise_if(
                vv != res,
                ("A job_id_validation func just returned a modified"
                 " value. It should return input unmodified or fail."),
                extra=dict(
                    key='%s.%s' % (k, v), value=vv, job_id_validation=func,
                    **ld),
                exception_kls=DAGMisconfigured)
def _iter_job_ids(dep_group, group_name, parent_app_name, ld):
    """Yield (parent_app_name, job_id) for each job_id explicitly listed in
    the dependency group metadata, assuming the child inherits from those
    specific job_ids.  Each job_id is validated against the parent's
    job_id template before being yielded.
    """
    for jid in dep_group['job_id']:
        try:
            parse_job_id(parent_app_name, jid)
        except InvalidJobId:
            details = dict(ld)
            details.update(dependency_group_name=group_name, job_id=jid)
            _log_raise(
                ("There's no way parent could have the child's job_id"),
                extra=details, exception_kls=InvalidJobId)
        yield (parent_app_name, jid)
def _iter_job_ids(dep_group, group_name, parent_app_name, ld):
    """Yield (parent_app_name, job_id) for each job_id explicitly listed in
    the dependency group metadata, assuming the child inherits from those
    specific job_ids.  Each job_id is validated against the parent's
    job_id template before being yielded.
    """
    for job_id in dep_group['job_id']:
        try:
            parse_job_id(parent_app_name, job_id)
        except InvalidJobId:
            _log_raise(
                ("There's no way parent could have the child's job_id"),
                extra=dict(ld, dependency_group_name=group_name,
                           job_id=job_id),
                exception_kls=InvalidJobId)
        yield (parent_app_name, job_id)
def passes_filter(app_name, job_id):
    """Determine if this job matches certain criteria that state it is a
    valid job for this app_name.

    A partially out of scope for dag stuff, but important detail:
    Jobs that don't match the criteria should immediately be marked
    as completed

    Returns True if the job is valid, False otherwise.
    May raise DAGMisconfigured for malformed valid_if_or configuration.
    """
    # for now, if we can parse it, it's valid
    pjob_id = parse_job_id(app_name, job_id)
    # does this job match criteria that makes it executable? if so, we can't
    # autocomplete it
    dg = cb.get_tasks_config()
    meta = dg[app_name]
    ld = dict(app_name=app_name, job_id=job_id)
    try:
        dct = dict(meta["valid_if_or"])
    except (KeyError, TypeError):
        return True  # everything is valid
    if "_func" in dct:
        import_path = dct.pop("_func")  # safe because config is immutable
        try:
            func = load_obj_from_path(import_path, ld)
        except Exception as err:
            # bug fix: `err.message` was removed in Python 3 (and is
            # deprecated in py2); format the exception itself instead
            raise err.__class__(
                "valid_if_or._func misconfigured: %s" % err)
        if func(app_name, **pjob_id):
            return True
    for k, v in dct.items():
        try:
            kk = pjob_id[k]
        except KeyError:
            _log_raise(
                "valid_if_or contains a key that's not in the job_id",
                extra=dict(valid_if_or_key=k, **ld),
                exception_kls=DAGMisconfigured,
            )
        vals = [get_NS().job_id_validations[k](x) for x in v]
        if kk in vals:
            return True
    return False
def passes_filter(app_name, job_id):
    """Determine if this job matches certain criteria that state it is a
    valid job for this app_name.

    A partially out of scope for dag stuff, but important detail:
    Jobs that don't match the criteria should immediately be marked
    as completed

    Returns True if the job is valid, False otherwise.
    May raise DAGMisconfigured for malformed valid_if_or configuration.
    """
    # for now, if we can parse it, it's valid
    pjob_id = parse_job_id(app_name, job_id)
    # does this job match criteria that makes it executable? if so, we can't
    # autocomplete it
    dg = cb.get_tasks_config()
    meta = dg[app_name]
    ld = dict(app_name=app_name, job_id=job_id)
    try:
        dct = dict(meta['valid_if_or'])
    except (KeyError, TypeError):
        return True  # everything is valid
    if '_func' in dct:
        import_path = dct.pop('_func')  # safe because config is immutable
        try:
            func = load_obj_from_path(import_path, ld)
        except Exception as err:
            # bug fix: `err.message` was removed in Python 3 (and is
            # deprecated in py2); format the exception itself instead
            raise err.__class__("valid_if_or._func misconfigured: %s" % err)
        if func(app_name, **pjob_id):
            return True
    for k, v in dct.items():
        try:
            kk = pjob_id[k]
        except KeyError:
            _log_raise("valid_if_or contains a key that's not in the job_id",
                       extra=dict(valid_if_or_key=k, **ld),
                       exception_kls=DAGMisconfigured)
        vals = [get_NS().job_id_validations[k](x) for x in v]
        if kk in vals:
            return True
    return False
def _build_dict_deps(dg, app_name, deps):
    """Build edges between dependent nodes by looking at listed dependencies

    `dg` (nx.MultiDiGraph instance) - the Tasks configuration as a graph
    `app_name` (str) - the name of a scheduled application
    `deps` (obj) - the dependencies for given `app_name`.  Should be a
        subclass of cb.TasksConfigBaseMapping, and is the value of given
        app_name's "depends_on" field
    """
    log_details = dict(app_name=app_name, key='depends_on', deps=dict(deps))
    is_single_group = (
        isinstance(deps, cb.TasksConfigBaseMapping) and "app_name" in deps)
    if is_single_group:
        # `deps` itself is one unnamed dependency group; use the default
        # group name
        _add_edges(
            dg, app_name=app_name,
            dep_name=get_NS().dependency_group_default_name,
            dep_grp=deps, log_details=log_details)
    else:
        # each key names a dependency group: either a single mapping or a
        # sequence of mappings sharing that name
        for group_name, group_data in deps.items():
            if isinstance(group_data, cb.TasksConfigBaseMapping):
                _add_edges(
                    dg=dg, app_name=app_name, dep_name=group_name,
                    dep_grp=group_data, log_details=log_details)
            elif isinstance(group_data, cb.TasksConfigBaseSequence):
                for subgrp in group_data:
                    _add_edges(
                        dg=dg, app_name=app_name, dep_name=group_name,
                        dep_grp=subgrp, log_details=log_details)
            else:
                _log_raise(
                    "Unrecognized dependency. Expected a list or dict",
                    dict(dep_name=group_name, dep_data=group_data,
                         **log_details),
                    exception_kls=DAGMisconfigured)