def pipe_uniq(context, _INPUT, conf, **kwargs):
    """This operator filters out non unique items according to the specified field.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique

    Yields (_OUTPUT):
    source items, one per unique field value
    """
    field = util.get_value(conf['field'], None, **kwargs)

    # Read the whole source and sort on the uniqueness field so that
    # duplicates become adjacent (drains _INPUT into memory).
    sorted_input = util.multikeysort(list(_INPUT), [field])

    # Emit the first item of each run of equal field values.
    seen = None
    for item in sorted_input:
        #todo: do we ever need get_value here instead of item[]?
        v = util.get_subkey(field, item)
        if seen != v:
            yield item
            seen = v
def _rulepass(rule, item):
    """Return True if item's field satisfies the given filter rule.

    Keyword arguments:
    rule -- (field, op, value) triple; op is one of: contains,
            doesnotcontain, matches, is, greater, less, after, before
    item -- source item (dict) to test

    Items whose field is missing or None never pass.
    """
    field, op, value = rule
    data = util.get_subkey(field, item)
    if data is None:
        return False

    #todo check which of these should be case insensitive
    if op == "contains":
        try:
            if value.lower() and value.lower() in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    if op == "doesnotcontain":
        try:
            if value.lower() and value.lower() not in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    if op == "matches":
        try:
            # data is known non-None here; re.search still raises TypeError
            # for non-string data (e.g. parsed dates) -- treat as no match
            if re.search(value, data):
                return True
        except TypeError:
            return False
    if op == "is":
        if data == value:
            return True
    if op == "greater":
        try:
            if Decimal(data) > Decimal(value):
                return True
        except Exception:  # non-numeric operands: fall back to raw comparison
            if data > value:
                return True
    if op == "less":
        try:
            if Decimal(data) < Decimal(value):
                return True
        except Exception:  # non-numeric operands: fall back to raw comparison
            if data < value:
                return True
    if op == "after":
        #todo handle partial datetime values
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data > value:
            return True
    if op == "before":
        #todo handle partial datetime values
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data < value:
            return True
    return False
def transform_to_rss(item, conf):
    """Build an RSS-shaped dict from item, driven by conf.

    For each known RSS field whose conf entry carries a value, copy the
    referenced subkey of item into the result; fields absent from conf
    (or lacking the expected structure) are skipped.
    """
    transformed = {}
    for name in RSS_FIELDS:
        try:
            source = conf[name]['value']
            if source:
                transformed[name] = util.get_subkey(source, item)
        except KeyError:
            continue
    return transformed
def transform_to_rss(item, conf):
    """Build an RSS-shaped dict from item, driven by conf.

    For each known RSS field whose conf entry carries a value, copy the
    referenced subkey of item into the result under the mapped RSS field
    name; fields absent from conf (or lacking the expected structure)
    are skipped.
    """
    transformed = {}
    for name in RSS_FIELDS:
        try:
            source = conf[name]['value']
            if source:
                transformed[RSS_FIELDS[name]] = util.get_subkey(source, item)
        except KeyError:
            continue
    return transformed
def _rulepass(rule, item):
    """Return True if item's field satisfies the given filter rule.

    Keyword arguments:
    rule -- (field, op, value) triple; op is one of: contains,
            doesnotcontain, matches, is, greater, less, after, before
    item -- source item (dict) to test

    Items whose field is missing or None never pass.
    """
    field, op, value = rule
    data = util.get_subkey(field, item)
    if data is None:
        return False

    #todo check which of these should be case insensitive
    if op == "contains":
        try:
            if value.lower() and value.lower() in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    if op == "doesnotcontain":
        try:
            if value.lower() and value.lower() not in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    if op == "matches":
        try:
            # re.search raises TypeError for non-string data (e.g. parsed
            # dates stored as timetuples) -- treat that as no match
            if re.search(value, data):
                return True
        except TypeError:
            return False
    if op == "is":
        if data == value:
            return True
    if op == "greater":
        try:
            if Decimal(data) > Decimal(value):
                return True
        except Exception:  # non-numeric operands: fall back to raw comparison
            if data > value:
                return True
    if op == "less":
        try:
            if Decimal(data) < Decimal(value):
                return True
        except Exception:  # non-numeric operands: fall back to raw comparison
            if data < value:
                return True
    if op == "after":
        #todo handle partial datetime values
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data > value:
            return True
    if op == "before":
        #todo handle partial datetime values
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data < value:
            return True
    return False
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace)

    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        #todo use the undocumented g,s,m,i flags here: rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs)  #todo use subkey?
        matchc = re.compile(match, re.DOTALL)  #compile for speed and we need to pass flags
        replace = util.get_value(rule['replace'], None, **kwargs)  #todo use subkey?
        if replace is None:
            replace = ''

        #convert regex to Python format: todo use a common routine for this
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)  #map $1 to \1 etc.
        #todo: also need to escape any existing \1 etc.

        rules.append((rule['field']['value'], matchc, replace))

    # compiled once, not per item/rule: expands ${field} references after
    # the rule substitution has been applied
    field_ref = re.compile(r'\$\{([^\}]+)\}')

    for item in _INPUT:
        def sub_fields(matchobj):
            # expand ${subkey} to the (unicode) value of that subkey of item
            return unicode(util.get_value({'subkey': matchobj.group(1)}, item))

        for field, matchc, replace in rules:
            v = util.as_unicode(util.get_subkey(field, item))
            v = matchc.sub(replace, v)
            v = field_ref.sub(sub_fields, v)
            util.set_value(item, field, v)

        yield item
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        loop_with -- pass a particular field into the submodule rather than the whole item
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']  # read from conf but currently unused below
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    #Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        #Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule

        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results is None:
                            results = []
                        results.append(i)
            if results and mode == 'assign':
                #this is a hack to make sure fetchpage works in an out of a loop while not disturbing strconcat in a loop etc.
                #(goes with the comment below about checking the delivery capability of the source)
                if len(results) == 1 and isinstance(results[0], dict):
                    results = [results]
        except HTTPError:
            #todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            if results and len(results) == 1:
                #note: i suspect this needs to be more discerning and only happen if the source can only ever deliver 1 result, e.g. strconcat vs. fetchpage
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded above
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs): """This operator loops over the input performing the embedded submodule. Keyword arguments: context -- pipeline context _INPUT -- source generator kwargs -- other inputs, e.g. to feed terminals for rule values conf: mode -- how to affect output - either assign or EMIT assign_to -- if mode is assign, which field to assign to (new or existing) loop_with -- pass a particular field into the submodule rather than the whole item embed -- embedded submodule Yields (_OUTPUT): source items after passing through the submodule and adding/replacing values """ mode = conf['mode']['value'] assign_to = conf['assign_to']['value'] assign_part = conf['assign_part']['value'] emit_part = conf['emit_part']['value'] loop_with = conf['with']['value'] embed_conf = conf['embed']['value']['conf'] #Prepare the submodule to take parameters from the loop instead of from the user embed_context = copy.copy(context) embed_context.submodule = True for item in _INPUT: if loop_with: inp = util.get_subkey(loop_with, item) else: inp = item #Pass any input parameters into the submodule embed_context.inputs = {} for k in embed_conf: embed_context.inputs[k] = unicode( util.get_value(embed_conf[k], item)) p = embed(embed_context, [inp], embed_conf) #prepare the submodule results = None try: #loop over the submodule, emitting as we go or collecting results for later assignment for i in p: if assign_part == 'first': if mode == 'EMIT': yield i else: results = i break else: #all if mode == 'EMIT': yield i else: if results: results.append(i) else: results = [i] if results and mode == 'assign': #this is a hack to make sure fetchpage works in an out of a loop while not disturbing strconcat in a loop etc. #(goes with the comment below about checking the delivery capability of the source) if len(results) == 1 and isinstance(results[0], dict): results = [results] except HTTPError: #todo any other errors we want to continue looping after? 
if context.verbose: print "Submodule gave HTTPError - continuing the loop" continue if mode == 'assign': if results and len( results ) == 1: #note: i suspect this needs to be more discerning and only happen if the source can only ever deliver 1 result, e.g. strconcat vs. fetchpage results = results[0] util.set_value(item, assign_to, results) yield item elif mode == 'EMIT': pass #already yielded else: raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)