def pipe_regex(context=None, _INPUT=None, conf=None, **kwargs):
    """Applies regex rules to _INPUT items.

    Each rule is applied to every item whose rule field is present and
    truthy. Two substitutions are computed per rule: the rule's own
    match/replace, and an expansion of ``${name}`` placeholders using
    values looked up on the current item.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : source generator of dicts
    conf: dict
        {
            'RULE': [
                {
                    'field': {'value': 'search field'},
                    'match': {'value': 'regex'},
                    'replace': {'value': 'replacement'}
                }
            ]
        }
    kwargs : forwarded to the rule generator and to item field lookups

    Yields
    ------
    _OUTPUT : source pipe items post regexes application
    """
    rule_defs = util.listize(conf['RULE'])

    # Materialize the rules: the same rules are iterated once per item, so a
    # one-shot iterator would be used up after the first pass.
    rules = list(_gen_rules(rule_defs, **kwargs))

    # Pattern matching `${name}` placeholders. Written as a raw string because
    # `\$`, `\{` and `\}` are regex escapes, not valid Python string escapes.
    # It never changes, so it is built once instead of once per rule.
    placeholder = r'\$\{(.+?)\}'

    for item in _INPUT:
        item = DotDict(item)

        def sub_fields(matchobj):
            # Replacement callback: look up the captured placeholder name on
            # the item currently being processed.
            return item.get(matchobj.group(1), **kwargs)

        for rule in rules:
            # rule[0] is the item field to operate on; rule[1] and rule[2]
            # are passed through to _get_args (presumably the match pattern
            # and its replacement -- confirm against _gen_rules).
            field = rule[0]

            if field not in item:
                continue

            value = item[field]

            if not value:
                continue

            # todo: do we ever need get_value here instead of item[]?
            #
            # When the field holds an HTML node (a dict-like value with a
            # 'content' key) rather than a plain string, substituting on the
            # whole value would operate on the dict's literal form, including
            # any attributes. Only the `content` sub-element should be
            # matched and replaced, so 'content' is passed as the key.
            # Possible gotcha: the content might itself be a subtree, in
            # which case the literal of the subtree dict is what gets
            # modified.
            # NOTE(review): it is not certain this detection works across the
            # board, nor that it is the right way to tell an HTML node from a
            # plain string.
            if hasattr(value, 'keys') and 'content' in value:
                args1 = _get_args(item, rule, rule[1], rule[2], 'content')
            else:
                args1 = _get_args(item, rule, rule[1], rule[2])

            # Placeholder expansion is computed the same way for both kinds
            # of value.
            # NOTE(review): both argument tuples are computed before either
            # is written back; if _get_args derives its result from the
            # item's current field value, the second set() below would
            # overwrite the first -- confirm against _get_args.
            args2 = _get_args(item, rule, placeholder, sub_fields)

            item.set(*args1)
            item.set(*args2)

        yield item
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """Run the embedded submodule once for every input item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values;
        forwarded to the item field lookup
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or
            existing)
        assign_part -- compared against 'first'; the outcome is handed to
            the result generator as a flag
        with -- pass a particular field into the submodule rather than the
            whole item
        embed -- mapping whose 'conf' entry is the submodule configuration
    embed -- embedded submodule, called as embed(context, [input], conf)

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing
    values

    Raises Exception when results are produced and mode is neither 'assign'
    nor 'EMIT'.
    """
    settings = DotDict(conf)
    mode = settings.get('mode')
    target_field = settings.get('assign_to')
    # The flag does not depend on the item, so it is derived once up front.
    only_first = settings.get('assign_part') == 'first'
    # TODO: conf may also carry 'emit_part'; what is this for?? (unused)
    source_field = settings.get('with')
    sub_conf = settings.get('embed')['conf']

    # The submodule takes its parameters from the loop instead of from the
    # user, so it runs against a copy of the context flagged as a submodule.
    sub_context = copy(context)
    sub_context.submodule = True

    for raw_item in _INPUT:
        item = DotDict(raw_item)

        # Feed either the selected field or the whole item to the submodule.
        if source_field:
            feed = item.get(source_field, **kwargs)
        else:
            feed = item

        # Prepare the submodule's inputs for this particular item.
        sub_context.inputs = dict(_gen_inputs(item, sub_conf))
        output = _gen_results(
            embed(sub_context, [feed], sub_conf), mode, only_first)

        # NOTE(review): if _gen_results returns a generator object this check
        # can never be true -- confirm against its definition.
        if not output:
            continue

        if mode == 'EMIT':
            for emitted in output:
                yield emitted
        elif mode == 'assign':
            collected = list(output)

            # Hack: a lone result that is not dict-like is unwrapped from its
            # list, so that fetchpage works in and out of a loop while not
            # disturbing strconcat in a loop etc.
            # NOTE(review): this probably needs to be more discerning and
            # only happen if the source can only ever deliver 1 result,
            # e.g. strconcat vs. fetchpage.
            if len(collected) == 1 and not hasattr(collected[0], 'keys'):
                collected = collected[0]

            item.set(target_field, collected)
            yield item
        else:
            raise Exception(
                "Invalid mode: %s. (Expected 'assign' or 'EMIT')" % mode)