def pipe_itembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        attrs -- key, value pairs

    Yields (_OUTPUT):
    item
    """
    attrs = conf['attrs']

    for item in _INPUT:
        d = {}
        for attr in attrs:
            try:
                key = util.get_value(attr['key'], item, **kwargs)
                value = util.get_value(attr['value'], item, **kwargs)
            except KeyError:
                # ignore if the item is referenced but doesn't have our
                # source or target field (todo: issue a warning if debugging?)
                continue

            util.set_value(d, key, value)

        yield d

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_sort(context, _INPUT, conf, **kwargs):
    """This operator sorts the input source according to the specified key.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        KEY -- list of fields to sort by

    Yields (_OUTPUT):
    source items sorted by key
    """
    order = []

    keys = conf['KEY']
    if not isinstance(keys, list):
        keys = [keys]

    for key in keys:
        field = util.get_value(key['field'], None, **kwargs)
        sort_dir = util.get_value(key['dir'], None, **kwargs)
        order.append('%s%s' % (sort_dir == 'DESC' and '-' or '', field))

    # read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)

    for item in sorted_input:
        yield item

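# Illustrative sketch (not part of pipe2py): one way a multi-key sort helper
# like util.multikeysort could honour the '-' prefix built above. The helper
# below is an assumption for illustration only; the real implementation
# lives in pipe2py's util module.
def _demo_multikeysort(items, order):
    # sort by each key from least to most significant; Python's sort is
    # stable, so earlier keys win ties. A '-' prefix means descending.
    for key in reversed(order):
        reverse = key.startswith('-')
        field = key.lstrip('-')
        items = sorted(items, key=lambda item: item[field], reverse=reverse)
    return items

assert _demo_multikeysort([{'a': 1}, {'a': 2}], ['-a']) == [{'a': 2}, {'a': 1}]
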
def pipe_strreplace(context, _INPUT, conf, **kwargs):
    """Replaces text with replacement text.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        RULE -- rules - each rule comprising (find, param, replace):
            find -- text to find
            param -- type of match: 1=first, 2=last, 3=every
            replace -- text to replace with

    Yields (_OUTPUT):
    source string with replacements
    """
    rules = []
    for rule in conf['RULE']:
        find = util.get_value(rule['find'], None, **kwargs)
        param = util.get_value(rule['param'], None, **kwargs)
        replace = util.get_value(rule['replace'], None, **kwargs)
        rules.append((find, param, replace))

    for item in _INPUT:
        t = item
        for rule in rules:
            if rule[1] == '1':
                t = t.replace(rule[0], rule[2], 1)
            elif rule[1] == '2':
                t = util.rreplace(t, rule[0], rule[2], 1)
            elif rule[1] == '3':
                t = t.replace(rule[0], rule[2])
            # todo: else assertion
        yield t

def pipe_strregex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (match, replace)

    Yields (_OUTPUT):
    source item after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        # TODO compile regex here: c = re.compile(match)
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        rules.append((match, replace))

    for item in _INPUT:
        for rule in rules:
            # note: apply the rule's own (match, replace) pair, not the
            # leftover loop variables from building the rules above
            item = re.sub(rule[0], rule[1], item)
        yield item

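# Illustrative sketch: the '$1' -> '\1' conversion performed above, so a
# Pipes-style replacement string works with Python's re module. Standard
# library only.
import re
assert re.sub(r'\$(\d+)', r'\\\1', 'Hello $1') == r'Hello \1'
# once converted, the replacement works as a normal backreference:
assert re.sub('(w+)', re.sub(r'\$(\d+)', r'\\\1', '[$1]'), 'www') == '[www]'
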
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace)

    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        # todo: use the undocumented g,s,m,i flags here:
        # rule['singlelinematch']['value'] == 2 indicates re.DOTALL, so use
        # that to pass to re.compile: see here for more
        # http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        matchc = re.compile(match, re.DOTALL)  # compile for speed and we need to pass flags
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        rules.append((rule['field']['value'], matchc, replace))

    for item in _INPUT:
        def sub_fields(matchobj):
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for rule in rules:
            # todo: do we ever need get_value here instead of item[]?
            if rule[0] in item and item[rule[0]]:
                util.set_value(item, rule[0],
                               re.sub(rule[1], rule[2], unicode(item[rule[0]])))
                util.set_value(item, rule[0],
                               re.sub(r'\$\{(.+)\}', sub_fields, unicode(item[rule[0]])))

        yield item

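# Illustrative sketch: the '${field}' item-reference substitution used in
# pipe_regex, with a plain dict standing in for util.get_value on a subkey.
import re
demo_item = {'title': 'Hello'}
assert re.sub(r'\$\{(.+)\}', lambda m: demo_item[m.group(1)],
              'say ${title}') == 'say Hello'
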
def pipe_fetchsitefeed(context, _INPUT, conf, **kwargs):
    """This source fetches and parses the first feed found on one or more
    sites to yield the feed entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    forever = pipe_forever(context, None, conf=None)

    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if not '://' in url:
                url = 'http://' + url

            if context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for feed in pipe_feedautodiscovery(
                    context, forever,
                    {u'URL': {u'type': u'url', u'value': url}}):
                for feed_item in pipe_fetch(
                        context, forever,
                        {u'URL': {u'type': u'url', u'value': feed['link']}}):
                    yield feed_item

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_tail(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally

    Yields (_OUTPUT):
    tail-truncated list of source items
    """
    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))

    try:
        # if python 2.6+ we can use a sliding window and save memory
        from collections import deque
        buffer = deque(_INPUT, limit)
    except:
        buffer = []
        for item in _INPUT:
            buffer.append(item)

    # slice [-limit:] in a list/deque compatible way
    for i in xrange(-1, -(min(len(buffer), limit) + 1), -1):
        yield buffer[i]

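# Illustrative sketch: a deque with a maxlen keeps only the most recent
# items, which is why the Python 2.6+ branch above saves memory on long
# feeds. (Note the function itself yields the tail in reverse order.)
from collections import deque
window = deque(iter(range(10)), 3)  # the second argument is maxlen
assert list(window) == [7, 8, 9]
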
def pipe_uniq(context, _INPUT, conf, **kwargs):
    """This operator filters out non-unique items according to the
    specified field.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique

    Yields (_OUTPUT):
    source items, one per unique field value
    """
    field = util.get_value(conf['field'], None, **kwargs)
    order = ['%s%s' % ('', field)]

    # read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)

    seen = None
    for item in sorted_input:
        # todo: do we ever need get_value here instead of item[]?
        if seen != item[field]:
            yield item
            seen = item[field]

def pipe_feedautodiscovery(context, _INPUT, conf, **kwargs):
    """This source searches for feed links in a page.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if not '://' in url:
                url = 'http://' + url

            if context.verbose:
                print "pipe_feedautodiscovery loading:", url

            d = autorss.getRSSLink(url.encode('utf-8'))

            for entry in d:
                yield {'link': entry}
                # todo: add rel, type, title

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """This source searches for feed links in a page.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

def pipe_uniq(context, _INPUT, conf, **kwargs):
    """This operator filters out non-unique items according to the
    specified field.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique

    Yields (_OUTPUT):
    source items, one per unique field value
    """
    field = util.get_value(conf['field'], None, **kwargs)
    order = ['%s%s' % ('', field)]

    # read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)

    seen = None
    for item in sorted_input:
        # todo: do we ever need get_value here instead of item[]?
        v = util.get_subkey(field, item)
        if seen != v:
            yield item
            seen = v

def pipe_subelement(context, _INPUT, conf, **kwargs):
    """Returns a subelement.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        path -- contains the value and type to select

    Yields (_OUTPUT):
    subelement of source item
    """
    path = conf['path']
    path['subkey'] = path['value']  # switch to using as a reference
    del path['value']

    for item in _INPUT:
        t = util.get_value(path, item)
        if t:
            if isinstance(t, list):
                for nested_item in t:
                    yield nested_item
            else:
                yield t

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_strconcat(context, _INPUT, conf, **kwargs):
    """This source builds a string.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        part -- parts

    Yields (_OUTPUT):
    string
    """
    if not isinstance(conf['part'], list):
        # todo: do we need to do this anywhere else?
        conf['part'] = [conf['part']]

    for item in _INPUT:
        s = ""
        for part in conf['part']:
            try:
                s += util.get_value(part, item, **kwargs)
            except AttributeError:
                # ignore if the item is referenced but doesn't have our
                # source field (todo: issue a warning if debugging?)
                continue
            except TypeError:
                if context.verbose:
                    print "pipe_strconcat: TypeError"

        yield s

def pipe_rssitembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an rss item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        dictionary of key/values

    Yields (_OUTPUT):
    item
    """
    for item in _INPUT:
        d = {}

        for key in conf:
            try:
                # todo: really dereference item? (sample pipe seems to
                # suggest so: surprising)
                value = util.get_value(conf[key], item, **kwargs)
            except KeyError:
                # ignore if the source doesn't have our source field
                # (todo: issue a warning if debugging?)
                continue

            key = map_key_to_rss.get(key, key)

            if value:
                if key == 'title':
                    # todo: also for guid -> y:id (is guid the only one?)
                    util.set_value(d, 'y:%s' % key, value)

                # todo: try/except?
                util.set_value(d, key, value)

        yield d

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_dateformat(context, _INPUT, conf, **kwargs):
    """This source formats a date.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        format -- date format

    Yields (_OUTPUT):
    formatted date
    """
    date_format = util.get_value(conf['format'], None, **kwargs)

    for item in _INPUT:
        s = item
        if isinstance(s, basestring):
            for df in util.ALTERNATIVE_DATE_FORMATS:
                try:
                    s = datetime.strptime(s, df).timetuple()
                    break
                except ValueError:
                    pass
            else:
                # todo: raise an exception: unexpected date format
                pass

        # todo: check all PHP formats are covered by Python
        s = time.strftime(date_format, s)
        # todo: silent error handling? e.g. if item is not a date

        yield s

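# Illustrative sketch: trying one of several candidate strptime formats and
# re-rendering with strftime, as pipe_dateformat does. The format strings
# here are assumptions; the real candidates live in
# util.ALTERNATIVE_DATE_FORMATS.
import time
from datetime import datetime
tt = datetime.strptime('2012-01-31', '%Y-%m-%d').timetuple()
assert time.strftime('%d.%m.%Y', tt) == '31.01.2012'
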
def pipe_fetch(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetches and parses one or more feeds to yield the feed entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)
            if not url:
                continue

            if context and context.verbose:
                print "pipe_fetch loading:", url

            parsed = feedparser.parse(urlopen(url).read())

            for entry in util.gen_entries(parsed):
                yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval)

    Yields (_OUTPUT):
    source items after copying/renaming
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None, **kwargs)  # todo use subkey?
        newfield = rule['field']
        # trick the get_value in the loop into mapping value onto an item
        # key (rather than taking it literally, i.e. make it a LHS
        # reference, not a RHS value)
        newfield['subkey'] = newfield['value']
        del newfield['value']

        rules.append((rule['op']['value'], newfield, newval))

    for item in _INPUT:
        for rule in rules:
            try:
                # forces an exception if any part is not found
                value = util.get_value(rule[1], item, **kwargs)
                util.set_value(item, rule[2], value)
                if rule[0] == 'rename':
                    try:
                        util.del_value(item, rule[1]['subkey'])
                    # TypeError catches pseudo subkeys, e.g. summary.content
                    except (KeyError, TypeError):
                        # ignore if the target doesn't have our field
                        # (todo: issue a warning if debugging?)
                        pass
            except AttributeError:
                # ignore if the source doesn't have our field
                # (todo: issue a warning if debugging?)
                pass

        yield item

def pipe_csv(context, _INPUT, conf, **kwargs):
    """This source fetches and parses a csv file to yield items.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields (_OUTPUT):
    file entries

    Note:
    Current restrictions:
        separator must be 1 character
        assumes every row has exactly the expected number of fields,
        as defined in the header
    """
    col_name = conf['col_name']

    for item in _INPUT:
        url = util.get_value(conf['URL'], item, **kwargs)
        separator = util.get_value(conf['separator'], item, **kwargs).encode('utf-8')
        skip = int(util.get_value(conf['skip'], item, **kwargs))
        col_mode = util.get_value(conf['col_mode'], item, **kwargs)
        col_row_start = int(util.get_value(conf['col_row_start'], item, **kwargs))
        col_row_end = int(util.get_value(conf['col_row_end'], item, **kwargs))

        f = urllib2.urlopen(url)

        if context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = UnicodeReader(f, delimiter=separator)

        fieldnames = []
        if col_mode == 'custom':
            fieldnames = [util.get_value(x) for x in col_name]
        else:
            for row in xrange((col_row_end - col_row_start) + 1):
                row = reader.next()
                fieldnames.extend(row)

        for row in reader:
            d = dict(zip(fieldnames, row))
            yield d

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

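# Illustrative sketch: how zip() pairs the header names with each row to
# build the yielded dicts; rows longer than the header are silently
# truncated, matching the restriction noted in the docstring.
assert dict(zip(['name', 'age'], ['alice', '42'])) == {'name': 'alice', 'age': '42'}
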
def pipe_urlbuilder(context, _INPUT, conf, **kwargs):
    """This source builds a url and yields it forever.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        BASE -- base
        PATH -- path elements
        PARAM -- query parameters

    Yields (_OUTPUT):
    url
    """
    for item in _INPUT:
        # note: we could cache get_value results if item == True
        url = util.get_value(conf['BASE'], item, **kwargs)
        if not url.endswith('/'):
            url += '/'

        if 'PATH' in conf:
            path = conf['PATH']
            if not isinstance(path, list):
                path = [path]
            path = [util.get_value(p, item, **kwargs) for p in path if p]

            url += "/".join(str(p) for p in path if p)
        url = url.rstrip("/")

        # ensure url is valid
        url = util.url_quote(url)

        param_defs = conf['PARAM']
        if not isinstance(param_defs, list):
            param_defs = [param_defs]

        params = dict([(util.get_value(p['key'], item, **kwargs),
                        util.get_value(p['value'], item, **kwargs))
                       for p in param_defs if p])
        if params and params.keys() != [u'']:
            url += "?" + urllib.urlencode(params)

        yield url

def pipe_simplemath(context, _INPUT, conf, **kwargs):
    """This operator performs basic arithmetic, such as addition and
    subtraction.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other value, if wired in
    conf:
        OTHER -- input value
        OP -- operator

    Yields (_OUTPUT):
    result
    """
    value = float(util.get_value(conf['OTHER'], None, **kwargs))
    op = util.get_value(conf['OP'], None, **kwargs)

    for item in _INPUT:
        yield OPS[op](float(item), value)

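# Illustrative sketch: a possible OPS table for pipe_simplemath, built from
# the operator module. The key names are assumptions; pipe2py defines its
# own mapping elsewhere.
import operator
OPS_DEMO = {
    'add': operator.add,
    'subtract': operator.sub,
    'multiply': operator.mul,
    'divide': operator.truediv,
    'modulo': operator.mod,
}
assert OPS_DEMO['add'](2.0, 3.0) == 5.0
assert OPS_DEMO['divide'](3.0, 2.0) == 1.5
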
def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval)

    Yields (_OUTPUT):
    source items after copying/renaming
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None, **kwargs)  # todo use subkey?
        newfield = rule['field']
        # trick the get_value in the loop into mapping value onto an item
        # key (rather than taking it literally, i.e. make it a LHS
        # reference, not a RHS value)
        newfield['subkey'] = newfield['value']
        del newfield['value']

        rules.append((rule['op']['value'], newfield, newval))

    for item in _INPUT:
        for rule in rules:
            try:
                # forces an exception if any part is not found
                value = util.get_value(rule[1], item, **kwargs)
                util.set_value(item, rule[2], value)
                if rule[0] == 'rename':
                    try:
                        util.del_value(item, rule[1]['subkey'])
                    except KeyError:
                        # ignore if the target doesn't have our field
                        # (todo: issue a warning if debugging?)
                        pass
            except AttributeError:
                # ignore if the source doesn't have our field
                # (todo: issue a warning if debugging?)
                pass

        yield item

def pipe_substr(context, _INPUT, conf, **kwargs):
    """Returns a substring.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        from -- starting character
        length -- number of characters to return

    Yields (_OUTPUT):
    portion of source string
    """
    sfrom = int(util.get_value(conf['from'], None, **kwargs))
    length = int(util.get_value(conf['length'], None, **kwargs))

    for item in _INPUT:
        yield item[sfrom:sfrom + length]

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_strregex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (match, replace)

    Yields (_OUTPUT):
    source item after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        # TODO compile regex here: c = re.compile(match)
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?

        # guard against a missing replacement before converting it below
        if replace is None:
            replace = ''

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        rules.append((match, replace))

    for item in _INPUT:
        for rule in rules:
            # note: apply the rule's own (match, replace) pair, not the
            # leftover loop variables from building the rules above
            item = re.sub(rule[0], rule[1], item)
        yield item

def pipe_strreplace(context, _INPUT, conf, **kwargs):
    """Replaces text with replacement text.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        RULE -- rules - each rule comprising (find, param, replace):
            find -- text to find
            param -- type of match: 1=first, 2=last, 3=every
            replace -- text to replace with

    Yields (_OUTPUT):
    source string with replacements
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        find = util.get_value(rule['find'], None, **kwargs)
        param = util.get_value(rule['param'], None, **kwargs)
        replace = util.get_value(rule['replace'], None, **kwargs)
        rules.append((find, param, replace))

    for item in _INPUT:
        t = item
        for rule in rules:
            if rule[1] == '1':
                t = t.replace(rule[0], rule[2], 1)
            elif rule[1] == '2':
                t = util.rreplace(t, rule[0], rule[2], 1)
            elif rule[1] == '3':
                t = t.replace(rule[0], rule[2])
            # todo: else assertion
        yield t

def pipe_fetch(context, _INPUT, conf, **kwargs):
    """This source fetches and parses one or more feeds to yield the
    feed entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if not '://' in url:
                url = 'http://' + url

            if context.verbose:
                print "pipe_fetch loading:", url

            d = feedparser.parse(url.encode('utf-8'))

            for entry in d['entries']:
                if 'updated_parsed' in entry:
                    # map from universal feedparser's normalised names
                    entry['pubDate'] = entry['updated_parsed']
                    # yahoo's own version
                    entry['y:published'] = entry['updated_parsed']
                if 'author' in entry:
                    entry['dc:creator'] = entry['author']
                if 'author_detail' in entry:
                    if 'href' in entry['author_detail']:
                        entry['author.uri'] = entry['author_detail']['href']
                    if 'name' in entry['author_detail']:
                        entry['author.name'] = entry['author_detail']['name']
                # todo: more!?
                if 'title' in entry:
                    entry['y:title'] = entry['title']  # yahoo's own version
                if 'id' in entry:
                    entry['y:id'] = entry['id']  # yahoo's own version
                # todo: more!?
                yield entry

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_datebuilder(context, _INPUT, conf, **kwargs):
    """This source builds a date and yields it forever.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- XXX
    conf:
        DATE -- date

    Yields (_OUTPUT):
    date
    """
    for item in _INPUT:
        date = util.get_value(conf['DATE'], item, **kwargs)

        try:
            date = float(date)
        except ValueError:
            pass

        if type(date) == float or type(date) == int:
            date = datetime.utcfromtimestamp(date)
        else:
            date = str(date).lower()
            if date.endswith(' day') or date.endswith(' days'):
                count = int(date.split(' ')[0])
                date = datetime.utcnow() + timedelta(days=count)
            elif date.endswith(' year') or date.endswith(' years'):
                count = int(date.split(' ')[0])
                date = datetime.utcnow()
                date = date.replace(year=date.year + count)
            elif date == 'today':
                date = datetime.utcnow()
            elif date == 'tomorrow':
                date = datetime.utcnow() + timedelta(days=1)
            elif date == 'yesterday':
                date = datetime.utcnow() + timedelta(days=-1)
            elif date == 'now':  # todo: is this allowed by Yahoo?
                date = datetime.utcnow()
            else:
                for df in util.ALTERNATIVE_DATE_FORMATS:
                    try:
                        date = datetime.strptime(date, df)
                        break
                    except ValueError:
                        pass
                else:
                    # todo: raise an exception: unexpected date format
                    pass

        yield date

def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """This source issues YQL queries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        yqlquery -- YQL query
        # todo: handle envURL

    Yields (_OUTPUT):
    query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = util.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()
        # note: query also has row count
        results = root.find('results')

        # convert xml into generation of dicts
        for element in results.getchildren():
            yield util.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

def pipe_truncate(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally

    Yields (_OUTPUT):
    truncated list of source items
    """
    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))

    for i in xrange(0, limit):
        yield _INPUT.next()

def pipe_datebuilder(context, _INPUT, conf, **kwargs):
    """This source builds a date and yields it forever.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- XXX
    conf:
        DATE -- date

    Yields (_OUTPUT):
    date
    """
    for item in _INPUT:
        date = util.get_value(conf['DATE'], item, **kwargs).lower()

        if date.endswith(' day') or date.endswith(' days'):
            count = int(date.split(' ')[0])
            date = (datetime.today() + timedelta(days=count)).timetuple()
        elif date.endswith(' year') or date.endswith(' years'):
            count = int(date.split(' ')[0])
            date = datetime.today().replace(
                year=datetime.today().year + count).timetuple()
        elif date == 'today':
            date = datetime.today().timetuple()
        elif date == 'tomorrow':
            date = (datetime.today() + timedelta(days=1)).timetuple()
        elif date == 'yesterday':
            date = (datetime.today() + timedelta(days=-1)).timetuple()
        elif date == 'now':  # todo: is this allowed by Yahoo?
            date = datetime.now().timetuple()  # better to use utcnow?
        else:
            for df in util.ALTERNATIVE_DATE_FORMATS:
                try:
                    date = datetime.strptime(date, df).timetuple()
                    break
                except ValueError:
                    pass
            else:
                # todo: raise an exception: unexpected date format
                pass

        yield date

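# Illustrative sketch: the relative-date arithmetic handled above ('-7 days',
# '1 year', 'today', ...), reduced to the day case. Standard library only;
# the helper name is for illustration.
from datetime import datetime, timedelta

def _demo_relative_days(phrase, today):
    if phrase.endswith(' day') or phrase.endswith(' days'):
        return today + timedelta(days=int(phrase.split(' ')[0]))
    return today

assert _demo_relative_days('-7 days', datetime(2012, 1, 8)) == datetime(2012, 1, 1)
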
def pipe_stringtokenizer(context, _INPUT, conf, **kwargs):
    """Splits a string into tokens delimited by separators.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        to-str -- separator string

    Yields (_OUTPUT):
    tokens of the input string
    """
    delim = util.get_value(conf['to-str'], None, **kwargs)

    for item in _INPUT:
        if item is not None:
            yield item.split(delim)

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_yql(context, _INPUT, conf, **kwargs):
    """This source issues YQL queries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        yqlquery -- YQL query
        # todo: handle envURL

    Yields (_OUTPUT):
    query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"

    for item in _INPUT:
        yql = util.get_value(conf['yqlquery'], item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        query = urllib.urlencode({'q': yql})
        req = urllib2.Request(url, query)
        response = urllib2.urlopen(req)

        # parse the response
        ft = ElementTree.parse(response)

        if context.verbose:
            print "pipe_yql loading xml:", yql

        root = ft.getroot()
        # note: query also has row count
        results = root.find('results')

        # convert xml into generation of dicts
        for element in results.getchildren():
            i = util.xml_to_dict(element)
            yield i

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

def pipe_stringtokenizer(context, _INPUT, conf, **kwargs):
    """Splits a string into tokens delimited by separators.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        to-str -- separator string

    Yields (_OUTPUT):
    tokens of the input string
    """
    delim = util.get_value(conf['to-str'], None, **kwargs)

    for item in _INPUT:
        if item is not None:
            for chunk in item.split(delim):
                yield {'content': chunk}

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

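# Illustrative sketch: this variant wraps each token in a {'content': ...}
# dict, so downstream modules can treat the tokens like regular items.
assert [{'content': c} for c in 'a,b,c'.split(',')] == [
    {'content': 'a'}, {'content': 'b'}, {'content': 'c'}]
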
def pipe_truncate(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally

    Yields (_OUTPUT):
    truncated list of source items
    """
    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))

    i = 0
    for item in _INPUT:
        if i >= limit:
            break
        yield item
        i += 1

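# Illustrative sketch: both truncate variants above behave like
# itertools.islice over the source generator.
from itertools import islice
assert list(islice(iter(range(10)), 3)) == [0, 1, 2]
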
def pipe_filter(context, _INPUT, conf, **kwargs):
    """This operator filters the input source, including or excluding items
    that match a set of defined rules.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        MODE -- filter mode, either "permit" or "block"
        COMBINE -- filter boolean combination, either "and" or "or"
        RULE -- rules - each rule comprising (field, op, value)

    Yields (_OUTPUT):
    source items that match the rules
    """
    mode = conf['MODE']['value']
    combine = conf['COMBINE']['value']

    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        field = rule['field']['value']
        value = util.get_value(rule['value'], None, **kwargs)  # todo use subkey?
        rules.append((field, rule['op']['value'], value))

    for item in _INPUT:
        if combine in COMBINE_BOOLEAN:
            res = COMBINE_BOOLEAN[combine](
                _rulepass(rule, item) for rule in rules)
        else:
            raise Exception(
                "Invalid combine %s (expecting 'and' or 'or')" % combine)

        if (res and mode == "permit") or (not res and mode == "block"):
            yield item

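# Illustrative sketch: a COMBINE_BOOLEAN table consistent with the lookup
# above, mapping the combine mode onto Python's built-in all()/any(). The
# exact table (and _rulepass) are defined elsewhere in pipe2py; this mapping
# is an assumption.
COMBINE_BOOLEAN_DEMO = {'and': all, 'or': any}
assert COMBINE_BOOLEAN_DEMO['and'](x > 0 for x in [1, 2, 3])
assert COMBINE_BOOLEAN_DEMO['or'](x > 2 for x in [1, 2, 3])
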
def _convert_item(rules, item, **kwargs):
    for rule in rules:
        value = util.get_value(rule[1], item, **kwargs)
        try:
            # forces an exception if any part is not found
            item.set(rule[2], value)
        except AttributeError:
            # ignore if the source doesn't have our field
            # todo: issue a warning if debugging?
            pass

        if rule[0] == 'rename':
            try:
                item.delete(rule[1]['subkey'])
            # TypeError catches pseudo subkeys, e.g. summary.content
            except (KeyError, TypeError):
                # ignore if the target doesn't have our field
                # todo: issue a warning if debugging?
                pass

    return item

def pipe_filter(context, _INPUT, conf, **kwargs):
    """This operator filters the input source, including or excluding items
    that match a set of defined rules.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        MODE -- filter mode, either "permit" or "block"
        COMBINE -- filter boolean combination, either "and" or "or"
        RULE -- rules - each rule comprising (field, op, value)

    Yields (_OUTPUT):
    source items that match the rules
    """
    mode = conf['MODE']['value']
    combine = conf['COMBINE']['value']

    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        field = rule['field']['value']
        value = util.get_value(rule['value'], None, **kwargs)  # todo use subkey?
        rules.append((field, rule['op']['value'], value))

    for item in _INPUT:
        if item == True:
            break

        if combine in COMBINE_BOOLEAN:
            res = COMBINE_BOOLEAN[combine](
                _rulepass(rule, item) for rule in rules)
        else:
            raise Exception(
                "Invalid combine %s (expecting 'and' or 'or')" % combine)

        if (res and mode == "permit") or (not res and mode == "block"):
            yield item

def pipe_fetchdata(context, _INPUT, conf, **kwargs):
    """This source fetches and parses any XML or JSON file (todo: iCal or
    KML) to yield a list of elements.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url
        path -- path to list

    Yields (_OUTPUT):
    elements
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if not '://' in url:
                url = 'http://' + url
            path = util.get_value(conf['path'], item, **kwargs)
            match = None

            # parse the file into a dictionary
            try:
                f = urllib2.urlopen(url)
                ft = ElementTree.parse(f)
                if context.verbose:
                    print "pipe_fetchdata loading xml:", url
                root = ft.getroot()

                # move to the point referenced by the path
                # todo: lxml would simplify and speed up this
                if path:
                    if root.tag[0] == '{':
                        namespace = root.tag[1:].split("}")[0]
                        for i in path.split(".")[:-1]:
                            root = root.find("{%s}%s" % (namespace, i))
                            if root is None:
                                return
                        match = "{%s}%s" % (namespace, path.split(".")[-1])
                    else:
                        match = "%s" % (path.split(".")[-1])

                # convert xml into generation of dicts
                if match:
                    for element in root.findall(match):
                        i = util.etree_to_pipes(element)
                        yield i
                else:
                    i = util.etree_to_pipes(root)
                    yield i

            except Exception, e:
                try:
                    f = urllib2.urlopen(url)
                    d = json.load(f)
                    # todo: test
                    if context.verbose:
                        print "pipe_fetchdata loading json:", url

                    if path:
                        for i in path.split(".")[:-1]:
                            d = d.get(i)
                        match = path.split(".")[-1]

                    if match:
                        for itemd in d:
                            if not match or itemd == match:
                                if isinstance(d[itemd], list):
                                    for nested_item in d[itemd]:
                                        yield nested_item
                                else:
                                    yield [d[itemd]]
                    else:
                        yield d
                except Exception, e:
                    # todo: try iCal and yield
                    # todo: try KML and yield
                    if context.verbose:
                        print "xml and json both failed:"
                    raise

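# Illustrative sketch: the namespace handling above in miniature - when the
# root tag carries a '{uri}' prefix, every path segment must be qualified
# with the same namespace. Standard library only.
from xml.etree import ElementTree
demo_root = ElementTree.fromstring(
    '<r xmlns="urn:x"><items><item>a</item></items></r>')
demo_ns = demo_root.tag[1:].split('}')[0]
demo_node = demo_root.find('{%s}items' % demo_ns)
assert [e.text for e in demo_node.findall('{%s}item' % demo_ns)] == ['a']
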
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to
                     (new or existing)
        loop_with -- pass a particular field into the submodule rather than
                     the whole item
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing
    values
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    # prepare the submodule to take parameters from the loop instead of
    # from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = item[loop_with]  # todo: get_value here?
        else:
            inp = item

        # pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  # prepare the submodule

        results = None
        try:
            # loop over the submodule, emitting as we go or collecting
            # results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  # all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
        except HTTPError:
            # todo: any other errors we want to continue looping after?
            if context.verbose:
                print "Submodule gave HTTPError - continuing the loop"
            continue

        if mode == 'assign':
            if results and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  # already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)

def sub_fields(matchobj):
    return util.get_value({'subkey': matchobj.group(1)}, item)

def pipe_fetchpage(context, _INPUT, conf, **kwargs):
    """Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
        URL -- url object containing the URL to download
        from -- string from where to start the input
        to -- string to limit the input
        token -- if present, split the input on this token to generate items

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

    TODOS:
    - don't retrieve pages larger than 200k
    - don't retrieve if page is not indexable.
    - item delimiter removes the closing tag if using a HTML tag
      (not documented but happens)
    - items should be cleaned, i.e. stripped of HTML tags
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print "FetchPage: Preparing to download:", url

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                request = urllib2.build_opener().open(request)
                content = unicode(
                    request.read(),
                    request.headers['content-type'].split('charset=')[-1])

                # TODO: it seems that Yahoo! converts relative links to
                # absolute; this needs to be done on the content but seems
                # to be a non-trivial task in Python
                if context.verbose:
                    print "............FetchPage: content ................."
                    print content.encode("utf-8")
                    print "............FetchPage: EOF ................."

                from_delimiter = util.get_value(conf["from"], _INPUT, **kwargs)
                to_delimiter = util.get_value(conf["to"], _INPUT, **kwargs)
                split_token = util.get_value(conf["token"], _INPUT, **kwargs)

                # determine from location, i.e. from where to start reading
                # content
                from_location = 0
                if from_delimiter != "":
                    from_location = content.find(from_delimiter)
                    # Yahoo! does not strip off the from_delimiter.
                    # if from_location > 0:
                    #     from_location += len(from_delimiter)

                # determine to location, i.e. where to stop reading content
                to_location = 0
                if to_delimiter != "":
                    to_location = content.find(to_delimiter, from_location)

                # reduce the content depending on the to/from locations
                if from_location > 0 and to_location > 0:
                    content = content[from_location:to_location]
                elif from_location > 0:
                    content = content[from_location:]
                elif to_location > 0:
                    content = content[:to_location]

                # determine items depending on the split_token
                res_items = []
                if split_token != "":
                    res_items = content.split(split_token)
                else:
                    res_items = [content]

                if context.verbose:
                    print "FetchPage: found count items:", len(res_items)

                for res_item in res_items:
                    if context.verbose:
                        print "--------------item data --------------------"
                        print res_item
                        print "--------------EOF item data ----------------"
                    yield {"content": res_item}

            except Exception, e:
                if context.verbose:
                    print "FetchPage: failed to retrieve from:", url
                    print "----------------- FetchPage -----------------"
                    import traceback
                    traceback.print_exc()
                    print "----------------- FetchPage -----------------"
                raise

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break

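# Illustrative sketch: the from/to trimming above, reduced to its essentials.
# Note that, like Yahoo! Pipes, it keeps the 'from' delimiter itself.
demo_content = '<html><body>KEEP THIS</body></html>'
demo_start = demo_content.find('<body>')
demo_end = demo_content.find('</body>', demo_start)
assert demo_content[demo_start:demo_end] == '<body>KEEP THIS'
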
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to
                     (new or existing)
        loop_with -- pass a particular field into the submodule rather than
                     the whole item
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing
    values
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    # prepare the submodule to take parameters from the loop instead of
    # from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        # pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))

        p = embed(embed_context, [inp], embed_conf)  # prepare the submodule

        results = None
        try:
            # loop over the submodule, emitting as we go or collecting
            # results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  # all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]

            if results and mode == 'assign':
                # this is a hack to make sure fetchpage works in and out of
                # a loop while not disturbing strconcat in a loop etc.
                # (goes with the comment below about checking the delivery
                # capability of the source)
                if len(results) == 1 and isinstance(results[0], dict):
                    results = [results]
        except HTTPError:
            # todo: any other errors we want to continue looping after?
            if context.verbose:
                print "Submodule gave HTTPError - continuing the loop"
            continue

        if mode == 'assign':
            if results and len(results) == 1:
                # note: i suspect this needs to be more discerning and only
                # happen if the source can only ever deliver 1 result, e.g.
                # strconcat vs. fetchpage
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  # already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)

def pipe_xpathfetchpage(context, _INPUT, conf, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
        URL -- url object containing the URL to download
        xpath -- xpath to extract
        html5 -- use html5 parser?
        useAsString -- emit items as string?

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

    TODOS:
    - don't retrieve pages larger than 1.5MB
    - don't retrieve if page is not indexable.
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print "XPathFetchPage: Preparing to download:", url

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                request = urllib2.build_opener().open(request)
                content = unicode(
                    request.read(),
                    request.headers['content-type'].split('charset=')[-1])

                # TODO: it seems that Yahoo! converts relative links to
                # absolute; this needs to be done on the content but seems
                # to be a non-trivial task in Python
                xpath = util.get_value(conf["xpath"], _INPUT, **kwargs)

                html5 = False
                useAsString = False
                if "html5" in conf:
                    html5 = util.get_value(conf["html5"], _INPUT, **kwargs) == "true"
                if "useAsString" in conf:
                    useAsString = util.get_value(
                        conf["useAsString"], _INPUT, **kwargs) == "true"

                if html5:
                    # from lxml.html import html5parser
                    # root = html5parser.fromstring(content)
                    from html5lib import parse
                    root = parse(content, treebuilder='lxml',
                                 namespaceHTMLElements=False)
                else:
                    from lxml import etree
                    root = etree.HTML(content)

                res_items = root.xpath(xpath)

                if context.verbose:
                    print "XPathFetchPage: found count items:", len(res_items)

                for res_item in res_items:
                    i = util.etree_to_pipes(res_item)  # TODO: xml_to_dict(res_item)
                    if context.verbose:
                        print "--------------item data --------------------"
                        print i
                        print "--------------EOF item data ----------------"

                    if useAsString:
                        yield {"content": unicode(i)}
                    else:
                        yield i

            except Exception, e:
                if context.verbose:
                    print "XPathFetchPage: failed to retrieve from:", url
                    print "----------------- XPathFetchPage -----------------"
                    import traceback
                    traceback.print_exc()
                    print "----------------- XPathFetchPage -----------------"
                raise

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
