def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that searches for and returns feed links found in a page.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}  # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

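# Usage sketch (illustrative only): the URL is hypothetical and the conf
# shape follows the {'type': ..., 'value': ...} convention used by the other
# pipes in this module. A one-item input with 'forever' set mimics a
# pipeforever source, so the outer loop runs exactly once:
#
#     forever = iter([{'forever': True}])
#     conf = {'URL': {'type': 'url', 'value': 'http://example.com/blog/'}}
#
#     for feed in pipe_feedautodiscovery(None, forever, conf):
#         print feed['link']
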
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses the first feed found on one or more
    sites. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for link in autorss.getRSSLink(url.encode('utf-8')):
                parsed = speedparser.parse(urlopen(link).read())

                for entry in utils.gen_entries(parsed):
                    yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

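# Usage sketch (illustrative; hypothetical URL). Unlike
# pipe_feedautodiscovery above, this yields the parsed feed entries rather
# than the feed links themselves:
#
#     forever = iter([{'forever': True}])
#     conf = {'URL': {'type': 'url', 'value': 'http://example.com/blog/'}}
#
#     for entry in pipe_fetchsitefeed(None, forever, conf):
#         print entry
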
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that issues YQL queries. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : yqlquery -- YQL query
        # todo: handle envURL

    Yields
    ------
    _OUTPUT : query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = utils.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert the XML into a generator of dicts
        for element in results.getchildren():
            yield utils.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

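# Usage sketch (illustrative): the query string below is a conventional YQL
# rss example with a hypothetical feed URL; running it requires the public
# YQL endpoint above to be reachable:
#
#     forever = iter([{'forever': True}])
#     yql = 'select title from rss where url="http://example.com/feed"'
#     conf = {'yqlquery': {'type': 'text', 'value': yql}}
#
#     for result in pipe_yql(None, forever, conf):
#         print result
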
def pipe_datebuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """A date module that converts a text string into a datetime value. Useful
    as terminal data. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items
    conf : {'DATE': {'type': 'datetime', 'value': '12/2/2014'}}

    Yields
    ------
    _OUTPUT : date timetuples
    """
    conf = DotDict(conf)

    for item in _INPUT:
        _input = DotDict(item)
        date = utils.get_value(conf['DATE'], _input, **kwargs).lower()

        if date.endswith(' day') or date.endswith(' days'):
            count = int(date.split(' ')[0])
            new_date = dt.today() + timedelta(days=count)
        elif date.endswith(' year') or date.endswith(' years'):
            count = int(date.split(' ')[0])
            new_date = dt.today().replace(year=dt.today().year + count)
        else:
            new_date = SWITCH.get(date)

        if not new_date:
            new_date = utils.get_date(date)

        if not new_date:
            raise Exception('Unrecognized date string: %s' % date)

        yield new_date.timetuple()

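# Usage sketch: the conf value comes straight from the docstring above;
# relative strings such as '2 days' or '-1 years' are also accepted by the
# suffix handling in the code:
#
#     forever = iter([{'forever': True}])
#     conf = {'DATE': {'type': 'datetime', 'value': '12/2/2014'}}
#
#     for timetuple in pipe_datebuilder(None, forever, conf):
#         print timetuple
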
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
            custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
        separator must be 1 character
        assumes every row has exactly the expected number of fields, as
        defined in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf['URL'], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf['skip'], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            # materialize the names so they aren't exhausted after the
            # first data row (a bare generator would yield empty dicts
            # for every row but the first)
            fieldnames = list(_gen_fieldnames(conf, reader, item, **kwargs))

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

def _gen_fieldnames(conf, reader, item, **kwargs):
    start = int(utils.get_value(conf['col_row_start'], item, **kwargs))
    end = int(utils.get_value(conf['col_row_end'], item, **kwargs))

    # yield individual column names (not whole rows) so that pipe_csv can
    # zip them directly with each data row; names from multiple header
    # rows are flattened in order
    for i in xrange((end - start) + 1):
        for fieldname in reader.next():
            yield fieldname

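# Usage sketch for pipe_csv (illustrative; the URL is hypothetical and the
# single-value conf entries follow the {'value': ...} convention used
# elsewhere in this module). Here the first row of the file supplies the
# column names via _gen_fieldnames:
#
#     forever = iter([{'forever': True}])
#     conf = {
#         'URL': {'value': 'http://example.com/data.csv'},
#         'separator': {'value': ','},
#         'skip': {'value': '0'},
#         'col_mode': {'value': 'row'},
#         'col_name': [],
#         'col_row_start': {'value': '1'},
#         'col_row_end': {'value': '1'},
#     }
#
#     for row in pipe_csv(None, forever, conf):
#         print row
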
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given web site as a string.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields
    conf : dict
        URL -- object containing the URL to download
        from -- string from where to start the input
        to -- string to limit the input
        token -- if present, split the input on this token to generate items

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

    TODOS:
    - don't retrieve pages larger than 200k
    - don't retrieve if page is not indexable.
    - item delimiter removes the closing tag if using a HTML tag
      (not documented but happens)
    - items should be cleaned, i.e. stripped of HTML tags

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but that
            # seems to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found %i items" % len(items)

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

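# Usage sketch (illustrative; hypothetical URL). With no 'token' in conf the
# whole extracted span is yielded as a single item; 'from' and 'to' delimit
# the portion of the page handled by _parse_content:
#
#     forever = iter([{'forever': True}])
#     conf = {
#         'URL': {'value': 'http://example.com/'},
#         'from': {'value': '<title>'},
#         'to': {'value': '</title>'},
#     }
#
#     for i in pipe_fetchpage(None, forever, conf):
#         print i['content']
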
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
        URL -- object containing the URL to download
        xpath -- xpath to extract
        html5 -- use html5 parser?
        useAsString -- emit items as string?

    TODOS:
    - don't retrieve pages larger than 1.5MB
    - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but that
            # seems to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'

            # f was exhausted by the read() above, so parse the content we
            # already have instead of the spent file object
            root = (html5parser.fromstring(content) if html5
                    else html.fromstring(content))
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found %i items' % len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break

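# Usage sketch (illustrative; hypothetical URL). Omitting 'html5' and
# setting 'useAsString' to 'false' yields one dict per matched node:
#
#     forever = iter([{'forever': True}])
#     conf = {
#         'URL': {'value': 'http://example.com/'},
#         'xpath': {'value': '//a'},
#         'useAsString': {'value': 'false'},
#     }
#
#     for node in pipe_xpathfetchpage(None, forever, conf):
#         print node
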