def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content into a stream of stripped text lines.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('lorem.txt')
        >>> objconf = Objectify({'url': url, 'encoding': ENCODING})
        >>> result = parser(None, objconf, assign='content')
        >>> next(result)['content'] == 'What is Lorem Ipsum?'
        True
    """
    if skip:
        return kwargs['stream']

    # Fetch the decoded resource and wrap each stripped line in a dict
    # keyed by the 'assign' attribute.
    source = fetch(decode=True, **objconf)
    lines = ({kwargs['assign']: text.strip()} for text in source)

    # auto_close releases the file handle once the generator is exhausted.
    return auto_close(lines, source)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content as a stream of CSV rows.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {
        ...     'url': url, 'sanitize': True, 'skip_rows': 0,
        ...     'encoding': ENCODING}
        >>> objconf = Objectify(conf)
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if skip:
        return kwargs['stream']

    # Translate riko's option names into the ones read_csv expects.
    renamed = {
        'first_row': objconf.skip_rows,
        'custom_header': objconf.col_names,
    }

    source = fetch(decode=True, **objconf)
    read_kwargs = merge([objconf, renamed])

    # auto_close releases the file handle once the reader is exhausted.
    return auto_close(read_csv(source, **read_kwargs), source)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content into a stream of stripped text lines.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('lorem.txt')
        >>> objconf = Objectify({'url': url, 'encoding': ENCODING})
        >>> result = parser(None, objconf, assign='content')
        >>> next(result)['content'] == 'What is Lorem Ipsum?'
        True
    """
    if skip:
        return kwargs['stream']

    # Fetch the decoded resource; each stripped line becomes one item
    # under the 'assign' key.
    handle = fetch(decode=True, **objconf)
    items_gen = ({kwargs['assign']: row.strip()} for row in handle)

    # Wrap with auto_close so the handle is closed after full iteration.
    return auto_close(items_gen, handle)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content as a stream of CSV rows.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {
        ...     'url': url, 'sanitize': True, 'skip_rows': 0,
        ...     'encoding': ENCODING}
        >>> objconf = Objectify(conf)
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if skip:
        return kwargs['stream']

    # Map riko's configuration names onto read_csv's keyword names.
    aliases = {
        'first_row': objconf.skip_rows,
        'custom_header': objconf.col_names,
    }

    handle = fetch(decode=True, **objconf)
    csv_kwargs = merge([objconf, aliases])

    # auto_close closes the handle once the CSV reader is exhausted.
    return auto_close(read_csv(handle, **csv_kwargs), handle)
def parser(base, objconf, skip=False, **kwargs):
    """Parse the pipe content and compute an exchange rate.

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Decimal: The computed exchange rate (the original stream when
            `skip` is set; Decimal(0) when the response can't be parsed)

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.275201')
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        # Exchanging a currency for itself is always 1:1.
        rate = Decimal(1)
    else:
        # Only decode remote (http) responses; local files are read as-is.
        decode = objconf.url.startswith('http')

        with fetch(decode=decode, **objconf) as f:
            try:
                json = next(items(f, ''))
            except Exception as e:
                # Deliberate best-effort: log the failure (including the raw
                # response) and fall back to a zero rate rather than raising.
                f.seek(0)
                logger.error('Error parsing {url}'.format(**objconf))
                logger.debug(f.read())
                logger.error(e)
                logger.error(traceback.format_exc())
                skip = True
                # BUGFIX: was plain int 0; use Decimal(0) so every code path
                # returns the same type (int 0 == Decimal(0), so callers
                # comparing against 0 are unaffected).
                rate = Decimal(0)

    if not (skip or same_currency):
        # Quantize the result to the configured number of decimal places.
        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate
def parser(base, objconf, skip=False, **kwargs):
    """Parse the pipe content and compute an exchange rate.

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Decimal: The computed exchange rate (the original stream when
            `skip` is set; Decimal(0) when the response can't be parsed)

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.275201')
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        # Exchanging a currency for itself is always 1:1.
        rate = Decimal(1)
    else:
        # Only decode remote (http) responses; local files are read as-is.
        decode = objconf.url.startswith('http')

        with fetch(decode=decode, **objconf) as f:
            try:
                json = next(items(f, ''))
            except Exception as e:
                # Deliberate best-effort: log the failure (including the raw
                # response) and fall back to a zero rate rather than raising.
                f.seek(0)
                logger.error('Error parsing {url}'.format(**objconf))
                logger.debug(f.read())
                logger.error(e)
                logger.error(traceback.format_exc())
                skip = True
                # BUGFIX: was plain int 0; use Decimal(0) so every code path
                # returns the same type (int 0 == Decimal(0), so callers
                # comparing against 0 are unaffected).
                rate = Decimal(0)

    if not (skip or same_currency):
        # Quantize the result to the configured number of decimal places.
        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content by issuing a YQL query.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.utils import get_abspath
        >>> from meza.fntools import Objectify
        >>>
        >>> feed = 'http://feeds.feedburner.com/TechCrunch/'
        >>> url = 'http://query.yahooapis.com/v1/public/yql'
        >>> query = "select * from feed where url='%s'" % feed
        >>> conf = {'query': query, 'url': url, 'debug': False}
        >>> objconf = Objectify(conf)
        >>> url = get_abspath(get_path('yql.xml'))
        >>>
        >>> with fetch(url) as f:
        ...     kwargs = {'stream': {}, 'response': f}
        ...     result = parser(None, objconf, **kwargs)
        >>>
        >>> next(result)['title']
        'Bring pizza home'
    """
    if skip:
        return kwargs['stream']

    response = kwargs.get('response')

    if not response:
        params = {'q': objconf.query, 'diagnostics': objconf.debug}

        # Default to automatic caching when memoization is requested but
        # no cache type was configured.
        if objconf.memoize and not objconf.cache_type:
            objconf.cache_type = 'auto'

        response = fetch(params=params, **objconf)

    # TODO: consider paging for large result sets
    root = xml2etree(response).getroot()
    results = root.find('results')
    return map(etree2dict, results)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content by issuing a YQL query.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.utils import get_abspath
        >>> from meza.fntools import Objectify
        >>>
        >>> feed = 'http://feeds.feedburner.com/TechCrunch/'
        >>> url = 'http://query.yahooapis.com/v1/public/yql'
        >>> query = "select * from feed where url='%s'" % feed
        >>> conf = {'query': query, 'url': url, 'debug': False}
        >>> objconf = Objectify(conf)
        >>> url = get_abspath(get_path('yql.xml'))
        >>>
        >>> with fetch(url) as f:
        ...     kwargs = {'stream': {}, 'response': f}
        ...     result = parser(None, objconf, **kwargs)
        >>>
        >>> next(result)['title']
        'Bring pizza home'
    """
    if skip:
        return kwargs['stream']

    f = kwargs.get('response')

    if not f:
        query_params = {'q': objconf.query, 'diagnostics': objconf.debug}

        # Fall back to automatic caching when memoization is enabled
        # without an explicit cache type.
        if objconf.memoize and not objconf.cache_type:
            objconf.cache_type = 'auto'

        f = fetch(params=query_params, **objconf)

    # TODO: consider paging for large result sets
    results = xml2etree(f).getroot().find('results')
    return map(etree2dict, results)
def get_rss(url, convert_charrefs=False):
    """Fetch a URL (or accept raw markup) and extract its entries.

    Uses EAFP twice: once to tolerate LinkParser implementations whose
    constructor doesn't accept `convert_charrefs`, and once to treat a
    non-fetchable `url` argument as inline content split into lines.
    """
    try:
        link_parser = LinkParser(convert_charrefs=convert_charrefs)
    except TypeError:
        # Constructor doesn't accept convert_charrefs; use defaults.
        link_parser = LinkParser()

    try:
        source = fetch(url, timeout=TIMEOUT)
    except ValueError:
        # Presumably `url` holds the markup itself — split into
        # non-empty lines instead of fetching.
        source = filter(None, url.splitlines())

    return file2entries(source, link_parser)
def get_rss(url, convert_charrefs=False):
    """Fetch a URL (or accept raw markup) and extract its entries.

    Falls back to a default-constructed LinkParser when the constructor
    rejects `convert_charrefs`, and falls back to splitting `url` into
    non-empty lines when it can't be fetched.
    """
    try:
        html_parser = LinkParser(convert_charrefs=convert_charrefs)
    except TypeError:
        # Older constructor signature: no convert_charrefs keyword.
        html_parser = LinkParser()

    try:
        content = fetch(url, timeout=TIMEOUT)
    except ValueError:
        # Not fetchable — treat the argument as the markup itself.
        content = filter(None, url.splitlines())

    return file2entries(content, html_parser)
def parse_rss(url=None, **kwargs):
    """Fetch and parse an RSS feed.

    If the URL can't be fetched, `url` itself is handed to the RSS
    parser; otherwise the response is parsed and always closed.
    """
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        # Fetch failed: let the RSS parser handle the raw input.
        return rssparser.parse(url)

    # Presumably speedparser needs the raw text; otherwise the file
    # object is passed through as-is — TODO confirm.
    content = f.read() if speedparser else f

    try:
        return rssparser.parse(content)
    finally:
        f.close()
def parse_rss(url=None, **kwargs):
    """Fetch and parse an RSS feed.

    Falls back to parsing the raw `url` argument when fetching raises;
    on the success path the response handle is always closed.
    """
    try:
        response = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        # Couldn't fetch — hand the raw input straight to the parser.
        return rssparser.parse(url)

    # When speedparser is in use the body is read eagerly; otherwise the
    # file object itself is passed — assumes the parser accepts it.
    body = response.read() if speedparser else response

    try:
        return rssparser.parse(body)
    finally:
        response.close()
def parser(base, objconf, skip=False, **kwargs):
    """Parse the pipe content and compute an exchange rate.

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Decimal: The computed exchange rate (the original stream when
            `skip` is set)

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.545801')
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        # A currency exchanges with itself at par.
        rate = Decimal(1)
    else:
        # Only decode remote (http) responses; local files are read as-is.
        decode = objconf.url.startswith('http')

        # Default to automatic caching when memoization is requested but
        # no cache type was configured.
        if objconf.memoize and not objconf.cache_type:
            objconf.cache_type = 'auto'

        with fetch(decode=decode, **objconf) as f:
            json = next(items(f, ''))

    if not (skip or same_currency):
        # Quantize to the configured number of decimal places.
        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content and select elements via an XPath expression.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    if skip:
        return kwargs['stream']

    url = get_abspath(objconf.url)
    extension = splitext(url)[1].lstrip('.')

    # Strict XML parsing is forced for .xml sources.
    as_xml = (extension == 'xml') or objconf.strict

    # Default to automatic caching when memoization is requested but no
    # cache type was configured.
    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    with fetch(**objconf) as f:
        root = xml2etree(f, xml=as_xml, html5=objconf.html5).getroot()
        elements = xpath(root, objconf.xpath)

    found = map(etree2dict, elements)

    if objconf.stringify:
        return ({kwargs['assign']: str(entry)} for entry in found)

    return found
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content, slicing out the text between two markers.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza.compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    if skip:
        return kwargs['stream']

    # Default to automatic caching when memoization is requested but no
    # cache type was configured.
    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    with fetch(decode=True, **objconf) as f:
        # Keep only the region between the start and end markers; the
        # join must happen inside the `with` since it consumes `f`.
        sliced = betwix(f, objconf.start, objconf.end, True)
        content = '\n'.join(sliced)

    parsed = get_string(content, objconf.start, objconf.end)
    detagged = get_text(parsed) if objconf.detag else parsed

    if objconf.token:
        chunks = detagged.split(objconf.token)
    else:
        chunks = [detagged]

    return ({kwargs['assign']: chunk} for chunk in chunks)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content, slicing out the text between two markers.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza.compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    if skip:
        return kwargs['stream']

    with fetch(decode=True, **objconf) as f:
        # betwix consumes the handle, so the join happens inside `with`.
        region = betwix(f, objconf.start, objconf.end, True)
        content = '\n'.join(region)

    parsed = get_string(content, objconf.start, objconf.end)
    detagged = get_text(parsed) if objconf.detag else parsed

    if objconf.token:
        pieces = detagged.split(objconf.token)
    else:
        pieces = [detagged]

    return ({kwargs['assign']: piece} for piece in pieces)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content into a dict keyed by file type.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('gigs.json')
        >>> objconf = Objectify({'url': url, 'path': 'value.items'})
        >>> result = parser(None, objconf, stream={})
        >>> result[0]['title'] == 'Business System Analyst'
        True
    """
    if skip:
        return kwargs['stream']

    url = get_abspath(objconf.url)
    # Derive the parser type from the file extension.
    extension = p.splitext(url)[1].lstrip('.')

    # Default to automatic caching when memoization is requested but no
    # cache type was configured.
    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    with fetch(**objconf) as f:
        return any2dict(f, extension, objconf.html5, path=objconf.path)
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content and select elements via an XPath expression.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    if skip:
        return kwargs['stream']

    url = get_abspath(objconf.url)
    extension = splitext(url)[1].lstrip('.')

    # Strict XML parsing is forced for .xml sources.
    as_xml = (extension == 'xml') or objconf.strict

    with fetch(**objconf) as f:
        root = xml2etree(f, xml=as_xml, html5=objconf.html5).getroot()
        elements = xpath(root, objconf.xpath)

    matched = map(etree2dict, elements)

    if objconf.stringify:
        return ({kwargs['assign']: str(entry)} for entry in matched)

    return matched
def parser(_, objconf, skip=False, **kwargs):
    """Parse the pipe content into a dict keyed by file type.

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('gigs.json')
        >>> objconf = Objectify({'url': url, 'path': 'value.items'})
        >>> result = parser(None, objconf, stream={})
        >>> result[0]['title'] == 'Business System Analyst'
        True
    """
    if skip:
        return kwargs['stream']

    url = get_abspath(objconf.url)
    extension = p.splitext(url)[1].lstrip('.')

    with fetch(**objconf) as f:
        # Fall back to the response's own extension when the URL lacks one.
        extension = extension or f.ext
        return any2dict(f, extension, objconf.html5, path=objconf.path)