Exemple #1
0
def pipe_regex(context=None, _INPUT=None, conf=None, **kwargs):
    """Applies regex rules to _INPUT items.

    Each rule is applied only to items that contain the rule's field with a
    truthy value. After the rule's own match/replace pass, a second pass
    handles `${name}` references, looking `name` up on the current item.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : source generator of dicts
    conf: dict
        {
            'RULE': [
                {
                    'field': {'value': 'search field'},
                    'match': {'value': 'regex'},
                    'replace': {'value': 'replacement'}
                }
            ]
        }
    kwargs : extra inputs, forwarded to the rule generator and to the item
        lookups used for `${name}` references

    Yields
    ------
    _OUTPUT : source pipe items post regexes application
    """
    rule_defs = util.listize(conf['RULE'])

    # use list bc iterator gets used up if there are no matching feeds
    rules = list(_gen_rules(rule_defs, **kwargs))

    # Pattern matching `${name}` references. It is a raw string so that the
    # backslashes reach the regex engine as written instead of relying on
    # invalid string escapes; it is loop invariant, so it is built once here.
    sub_string = r'\$\{(.+?)\}'

    for item in _INPUT:
        item = DotDict(item)

        def sub_fields(matchobj):
            # resolve the captured `${name}` against the current item
            return item.get(matchobj.group(1), **kwargs)

        for rule in rules:
            # rule[0] is the field to examine; rule[1] and rule[2] are handed
            # to _get_args as the pattern and its replacement
            field = rule[0]

            if field not in item:
                continue

            value = item[field]

            if not value:
                continue

            # todo: do we ever need get_value here instead of item[]?
            # when the subject being examined is an HTML node, not a
            # string then the unicode() converts the dict representing the node
            # to a dict literal, and then attempts to apply the pattern
            # to the literal; as an HTML element node, it may have attributes
            # which then appear in the literal. It should be only matching on
            # (and replacing the value of) the `.content` subelement
            # I'm not confident that what is below will work across the board
            # nor if this is the right way to detect that we're looking at
            # an HTML node and not a plain string
            if hasattr(value, 'keys') and 'content' in value:
                # this looks like an HTML node, so only do substitution on
                # the content of the node possible gotcha: the content
                # might be a subtree, in which case we revert to modifying
                # the literal of the subtree dict
                args1 = _get_args(item, rule, rule[1], rule[2], 'content')
            else:
                args1 = _get_args(item, rule, rule[1], rule[2])

            # The `${name}` pass takes the same arguments for both kinds of
            # value. Both argument sets are built before either is applied,
            # which is the order the calls have always been made in.
            args2 = _get_args(item, rule, sub_string, sub_fields)

            item.set(*args1)
            item.set(*args2)

        yield item
Exemple #2
0
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """Run the embedded submodule once per input item.

    Keyword arguments:
    context -- pipeline context; a copy of it, flagged as a submodule
        context, is handed to the embedded submodule
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either 'assign' or 'EMIT'
        assign_to -- if mode is assign, which field to assign to
            (new or existing)
        assign_part -- compared against 'first'; the outcome is passed on
            to the result generator
        with -- pass a particular field into the submodule rather than the
            whole item
        embed -- mapping whose 'conf' entry configures the submodule
    embed -- embedded submodule

    Yields (_OUTPUT):
    in EMIT mode, the submodule results themselves; in assign mode, the
    source items with the results stored under `assign_to`

    Raises Exception when results are available and mode is neither
    'assign' nor 'EMIT'.
    """
    settings = DotDict(conf)
    output_mode = settings.get('mode')
    target_field = settings.get('assign_to')
    part = settings.get('assign_part')
    # TODO: what is this for??
    # emit_part = conf.get('emit_part')
    source_field = settings.get('with')
    submodule_conf = settings.get('embed')['conf']
    only_first = part == 'first'

    # The submodule takes its parameters from the loop instead of from the
    # user, so it runs against its own copy of the context
    sub_context = copy(context)
    sub_context.submodule = True

    for raw_item in _INPUT:
        record = DotDict(raw_item)

        if source_field:
            payload = record.get(source_field, **kwargs)
        else:
            payload = record

        # prepare and run the submodule for this single item
        sub_context.inputs = dict(_gen_inputs(record, submodule_conf))
        pipeline = embed(sub_context, [payload], submodule_conf)
        produced = _gen_results(pipeline, output_mode, only_first)

        if not produced:
            continue

        if output_mode == 'EMIT':
            for emitted in produced:
                yield emitted

            continue

        if output_mode != 'assign':
            raise Exception(
                "Invalid mode: %s. (Expected 'assign' or 'EMIT')"
                % output_mode)

        collected = list(produced)

        # this is a hack to make sure fetchpage works in an out of a
        # loop while not disturbing strconcat in a loop etc.
        # note: i suspect this needs to be more discerning and only happen
        # if the source can only ever deliver 1 result, e.g. strconcat vs.
        # fetchpage
        is_lone_scalar = (
            len(collected) == 1 and not hasattr(collected[0], 'keys'))

        if is_lone_scalar:
            collected = collected[0]

        record.set(target_field, collected)
        yield record