def pipe_itembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an item.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        attrs -- key, value pairs: a list of {'key': ..., 'value': ...}
                 definitions, or a single such definition

    Yields (_OUTPUT):
    item -- one freshly built dict per input item
    """
    attrs = conf['attrs']
    # Iterating a bare dict would walk its keys rather than the definition
    # itself, so wrap a single definition in a list (same normalisation the
    # RULE configuration receives in pipe_regex / pipe_rename).
    if not isinstance(attrs, list):
        attrs = [attrs]

    for item in _INPUT:
        d = {}
        for attr in attrs:
            try:
                key = util.get_value(attr['key'], item, **kwargs)
                value = util.get_value(attr['value'], item, **kwargs)
            except KeyError:
                # ignore if the item is referenced but doesn't have our
                # source or target field
                # (todo: issue a warning if debugging?)
                continue

            util.set_value(d, key, value)

        yield d

        # i.e. this is being fed forever, i.e. not in a loop,
        # so we just yield our item once
        if item == True:
            break
def pipe_rssitembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an rss item.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf -- dictionary of key/values; each key is translated through
            map_key_to_rss (unknown keys are kept as they are)

    Yields (_OUTPUT):
    item -- one freshly built dict per input item
    """
    for item in _INPUT:
        built = {}

        for conf_key in conf:
            try:
                # todo really dereference item?
                # (sample pipe seems to suggest so: surprising)
                value = util.get_value(conf[conf_key], item, **kwargs)
            except KeyError:
                # ignore if the source doesn't have our source field
                # (todo: issue a warning if debugging?)
                continue

            rss_key = map_key_to_rss.get(conf_key, conf_key)

            # empty/falsy values are left out of the built item
            if not value:
                continue

            if rss_key == 'title':
                # the title is additionally stored under a 'y:' key
                # todo also for guid -> y:id (is guid the only one?)
                util.set_value(built, 'y:%s' % rss_key, value)

            # todo try/except?
            util.set_value(built, rss_key, value)

        yield built

        # i.e. this is being fed forever, i.e. not in a loop,
        # so we just yield our item once
        if item == True:
            break
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace);
                a single rule or a list of rules

    Yields (_OUTPUT):
    source items after replacing values matching regexes (items are
    modified in place)
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        # todo use the undocumented g,s,m,i flags here:
        # rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more
        # http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        # compile for speed and we need to pass flags
        matchc = re.compile(match, re.DOTALL)
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''
        # convert regex to Python format: map $1 to \1 etc.
        # todo use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)
        rules.append((rule['field']['value'], matchc, replace))

    # Non-greedy so that each ${...} reference in a value is expanded on its
    # own; a greedy match would swallow everything from the first '${' to the
    # last '}' and look that whole span up as one key.
    field_ref = re.compile(r'\$\{(.+?)\}')

    for item in _INPUT:
        def sub_fields(matchobj):
            # expand a ${name} reference to the named value of this item
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for field, matchc, replace in rules:
            # todo: do we ever need get_value here instead of item[]?
            if field in item and item[field]:
                util.set_value(
                    item, field, matchc.sub(replace, unicode(item[field])))
                util.set_value(
                    item, field, field_ref.sub(sub_fields, unicode(item[field])))

        yield item
def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval);
                a single rule or a list of rules

    Yields (_OUTPUT):
    source items after copying/renaming (items are modified in place)
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None, **kwargs)  # todo use subkey?
        # Trick the get_value in the loop into mapping value onto an item key
        # (rather than taking it literally, i.e. make it a LHS reference, not
        # a RHS value). Work on a copy: rewriting rule['field'] itself would
        # strip 'value' from the caller's conf and break any later run that
        # reuses the same conf.
        field_ref = dict(rule['field'])
        field_ref['subkey'] = field_ref.pop('value')
        rules.append((rule['op']['value'], field_ref, newval))

    for item in _INPUT:
        for op, field_ref, newval in rules:
            try:
                # forces an exception if any part is not found
                value = util.get_value(field_ref, item, **kwargs)
                util.set_value(item, newval, value)
                if op == 'rename':
                    try:
                        util.del_value(item, field_ref['subkey'])
                    except (KeyError, TypeError):
                        # TypeError catches pseudo subkeys, e.g. summary.content
                        # ignore if the target doesn't have our field
                        # (todo: issue a warning if debugging?)
                        pass
            except AttributeError:
                # ignore if the source doesn't have our field
                # (todo: issue a warning if debugging?)
                pass

        yield item
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace);
                a single rule or a list of rules

    Yields (_OUTPUT):
    source items after replacing values matching regexes (items are
    modified in place)
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        # todo use the undocumented g,s,m,i flags here:
        # rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more
        # http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        # compile for speed and we need to pass flags
        matchc = re.compile(match, re.DOTALL)
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''
        # convert regex to Python format: map $1 to \1 etc.
        # todo use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)
        rules.append((rule['field']['value'], matchc, replace))

    # Non-greedy so that each ${...} reference in a value is expanded on its
    # own; a greedy match would swallow everything from the first '${' to the
    # last '}' and look that whole span up as one key.
    field_ref = re.compile(r'\$\{(.+?)\}')

    for item in _INPUT:
        def sub_fields(matchobj):
            # expand a ${name} reference to the named value of this item
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for field, matchc, replace in rules:
            # todo: do we ever need get_value here instead of item[]?
            if field in item and item[field]:
                util.set_value(
                    item, field, matchc.sub(replace, unicode(item[field])))
                util.set_value(
                    item, field, field_ref.sub(sub_fields, unicode(item[field])))

        yield item
def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval);
                a single rule or a list of rules

    Yields (_OUTPUT):
    source items after copying/renaming (items are modified in place)
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None, **kwargs)  # todo use subkey?
        # Trick the get_value in the loop into mapping value onto an item key
        # (rather than taking it literally, i.e. make it a LHS reference, not
        # a RHS value). Work on a copy: rewriting rule['field'] itself would
        # strip 'value' from the caller's conf and break any later run that
        # reuses the same conf.
        field_ref = dict(rule['field'])
        field_ref['subkey'] = field_ref.pop('value')
        rules.append((rule['op']['value'], field_ref, newval))

    for item in _INPUT:
        for op, field_ref, newval in rules:
            try:
                # forces an exception if any part is not found
                value = util.get_value(field_ref, item, **kwargs)
                util.set_value(item, newval, value)
                if op == 'rename':
                    try:
                        util.del_value(item, field_ref['subkey'])
                    except (KeyError, TypeError):
                        # TypeError is caught as well so that pseudo subkeys
                        # (e.g. summary.content) do not abort the pipe
                        # ignore if the target doesn't have our field
                        # (todo: issue a warning if debugging?)
                        pass
            except AttributeError:
                # ignore if the source doesn't have our field
                # (todo: issue a warning if debugging?)
                pass

        yield item
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' stops after the first submodule result;
                       anything else uses all of them
        emit_part -- read from conf but not otherwise used here
        with -- pass a particular field into the submodule rather than the whole item
        embed -- configuration of the embedded submodule
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values

    Raises:
    Exception -- if mode is neither assign nor EMIT
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    # Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = item[loop_with]  # todo: get_value here?
        else:
            inp = item

        # Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  # prepare the submodule

        results = None
        try:
            # loop over the submodule, emitting as we go or collecting
            # results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  # all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
        except HTTPError:  # todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            # Unwrap a lone collected result. A dict here is the submodule's
            # own item (assign_part == 'first'), not a collection: indexing a
            # single-key dict with [0] would raise KeyError, so leave it alone.
            if results and not isinstance(results, dict) and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  # already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context (not read by this module)
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace);
                a single rule or a list of rules. The presence of the
                optional keys multilinematch, casematch and singlelinematch
                switches on re.MULTILINE, re.IGNORECASE and re.DOTALL

    Yields (_OUTPUT):
    source items after replacing values matching regexes (items are
    modified in place)
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        # flags = re.DOTALL  # DOTALL was the default for pipe2py previously
        flags = 0
        if 'multilinematch' in rule:  # flag 'm'
            flags |= re.MULTILINE
        if 'casematch' in rule:  # flag 'i'; this name is reversed from its meaning
            flags |= re.IGNORECASE
        if 'singlelinematch' in rule:  # flag 's'
            flags |= re.DOTALL
        # todo 'globalmatch' is the default in python
        # todo if set, re.sub() below would get count=0 and by default would get count=1

        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        # compile for speed and we need to pass flags
        matchc = re.compile(match, flags)
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''
        # convert regex to Python format: map $1 to \1 etc.
        # todo use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)
        rules.append((rule['field']['value'], matchc, replace))

    # ${name} references; compiled once rather than on every substitution
    field_ref = re.compile(r'\$\{(.+?)\}')

    for item in _INPUT:
        def sub_fields(matchobj):
            # expand a ${name} reference to the named value of this item
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for field, matchc, replace in rules:
            # todo: do we ever need get_value here instead of item[]?
            # todo: when the subject being examined is an HTML node, not a
            # todo: string, unicode() converts the dict representing the node
            # todo: to a dict literal (attributes included) and the pattern is
            # todo: applied to that literal. It should only match on (and
            # todo: replace the value of) the .content subelement.
            # todo: I'm not confident that what is below will work across the
            # todo: board, nor that this is the right way to detect that we're
            # todo: looking at an HTML node and not a plain string
            if field in item and item[field]:
                subject = item[field]
                if isinstance(subject, dict) and 'content' in subject:
                    # this looks like an HTML node, so only do substitution on
                    # the content of the node
                    # possible gotcha: the content might be a subtree, in which
                    # case we revert to modifying the literal of the subtree dict
                    subject = subject['content']
                util.set_value(item, field, matchc.sub(replace, unicode(subject)))
                util.set_value(
                    item, field, field_ref.sub(sub_fields, unicode(item[field])))

        yield item
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' stops after the first submodule result;
                       anything else uses all of them
        emit_part -- read from conf but not otherwise used here
        with -- pass a particular field into the submodule rather than the whole item
        embed -- configuration of the embedded submodule
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values

    Raises:
    Exception -- if mode is neither assign nor EMIT
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    # Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        # Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  # prepare the submodule

        results = None
        try:
            # loop over the submodule, emitting as we go or collecting
            # results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  # all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
            if results and mode == 'assign':
                # this is a hack to make sure fetchpage works in an out of a
                # loop while not disturbing strconcat in a loop etc.
                # (goes with the comment below about checking the delivery
                # capability of the source)
                # A dict here is the submodule's own item (assign_part ==
                # 'first'); indexing it with [0] would raise KeyError.
                if (not isinstance(results, dict) and len(results) == 1
                        and isinstance(results[0], dict)):
                    results = [results]
        except HTTPError:  # todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            # note: i suspect this needs to be more discerning and only happen
            # if the source can only ever deliver 1 result, e.g. strconcat vs.
            # fetchpage
            # Dicts are skipped for the same reason as above: a single-key
            # dict has len 1 but cannot be indexed with [0].
            if results and not isinstance(results, dict) and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  # already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' stops after the first submodule result;
                       anything else uses all of them
        emit_part -- read from conf but not otherwise used here
        with -- pass a particular field into the submodule rather than the whole item
        embed -- configuration of the embedded submodule
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values

    Raises:
    Exception -- if mode is neither assign nor EMIT
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    # Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        # Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  # prepare the submodule

        results = None
        try:
            # loop over the submodule, emitting as we go or collecting
            # results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  # all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
            if results and mode == 'assign':
                # this is a hack to make sure fetchpage works in an out of a
                # loop while not disturbing strconcat in a loop etc.
                # (goes with the comment below about checking the delivery
                # capability of the source)
                # A dict here is the submodule's own item (assign_part ==
                # 'first'); indexing it with [0] would raise KeyError.
                if (not isinstance(results, dict) and len(results) == 1
                        and isinstance(results[0], dict)):
                    results = [results]
        except HTTPError:  # todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            # note: i suspect this needs to be more discerning and only happen
            # if the source can only ever deliver 1 result, e.g. strconcat vs.
            # fetchpage
            # Dicts are skipped for the same reason as above: a single-key
            # dict has len 1 but cannot be indexed with [0].
            if results and not isinstance(results, dict) and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  # already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' stops after the first submodule result;
                       anything else uses all of them
        emit_part -- read from conf but not otherwise used here
        with -- pass a particular field into the submodule rather than the whole item
        embed -- configuration of the embedded submodule
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values

    Raises:
    Exception -- if mode is neither assign nor EMIT
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    # Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = item[loop_with]  # todo: get_value here?
        else:
            inp = item

        # Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  # prepare the submodule

        results = None
        try:
            # loop over the submodule, emitting as we go or collecting
            # results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  # all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
        except HTTPError:  # todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            # Unwrap a lone collected result. A dict here is the submodule's
            # own item (assign_part == 'first'), not a collection: indexing a
            # single-key dict with [0] would raise KeyError, so leave it alone.
            if results and not isinstance(results, dict) and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  # already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)