def gotTagStart(self, name, attributes):
    """Build an ``Element`` for an opening tag and register it on the stacks.

    Args:
        name: The tag name; forwarded to ``_check_parent``, ``Element`` and,
            for soon-closing tags in lenient mode, ``gotTagEnd``.
        attributes: The tag attributes; run through ``unescape_dict`` before
            any further processing.
    """
    # logger.debug('%s<%s>', ' ' * self.indentlevel, name)
    self.indentlevel += 2
    parent = self._check_parent(self._getparent(), name)

    unescaped = unescape_dict(attributes)
    inherited = self.nsstack[-1][0]
    declared = dict(self._gen_newspaces(unescaped))
    remaining = dict(self._gen_new_attrs(unescaped))
    scope = merge([inherited, declared])
    resolved = dict(self._gen_attrs(remaining, scope))

    el = Element(
        name, resolved, parent, self.filename, self.saveMark(),
        case_insensitive=self.case_insensitive,
        namespace=scope.get(''))

    prefixes = invert_dict(declared)
    el.addPrefixes(prefixes)

    if declared:
        # A new nsstack frame is pushed only when `_gen_newspaces` produced
        # entries; it extends the prefix map of the current top frame.
        prefix_scope = merge([self.nsstack[-1][2], prefixes])
        self.nsstack.append((scope, el, prefix_scope))

    self.elementstack.append(el)

    if parent:
        parent.appendChild(el)

    # In lenient mode, tags listed in `soonClosers` are closed immediately.
    if self.lenient and el.tagName in self.soonClosers:
        self.gotTagEnd(name)
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content; return ``kwargs['stream']``
            unchanged instead

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {
        ...     'url': url, 'sanitize': True, 'skip_rows': 0,
        ...     'encoding': ENCODING}
        >>> objconf = Objectify(conf)
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if skip:
        return kwargs['stream']

    # `read_csv` is given the pipe's `skip_rows`/`col_names` settings under
    # the `first_row`/`custom_header` keyword names.
    overrides = {
        'first_row': objconf.skip_rows,
        'custom_header': objconf.col_names}

    source = fetch(decode=True, **objconf)
    reader_kwargs = merge([objconf, overrides])
    return auto_close(read_csv(source, **reader_kwargs), source)
def meta_reducer(item, rules):
    """Apply the substitution rules to one field of ``item``.

    ``kwargs`` and ``multi`` are not parameters; they are resolved from the
    enclosing scope.

    Args:
        item: Object supporting ``get``; also merged into the result.
        rules: Non-empty sequence of rules; ``rules[0]['field']`` names the
            field that is read and replaced.

    Returns:
        DotDict: ``item`` merged with ``{field: replacement}``.
    """
    field = rules[0]['field']
    word = item.get(field, **kwargs)
    grouped = group_by(rules, 'flags')

    if multi:
        # With `multi`, the second element of each group is reduced.
        group_rules, reducer = [g[1] for g in grouped], multi_substitute
    else:
        group_rules, reducer = rules, substitute

    replacement = reduce(reducer, group_rules, word)
    updated = merge([item, {field: replacement}])
    return DotDict(updated)
def wrapper(item=None, **kwargs):
    """Run the wrapped ``pipe`` on one item and emit the resulting stream.

    ``self`` and ``pipe`` are resolved from the enclosing scope.

    Args:
        item: The input item; a falsy value is replaced with ``{}``.
        **kwargs: Options merged over ``self.defaults``, the local defaults
            and ``self.opts`` (later sources win). ``kwargs`` is updated in
            place with ``conf``, ``assign``, ``skip`` and ``stream`` before
            being forwarded to ``pipe``.

    Yields:
        In sync mode, each element of the resulting stream. In async mode,
        the value returned by ``pipe`` is yielded once and the final stream
        is passed to ``return_value`` (presumably an inlineCallbacks-style
        return helper -- confirm against its definition).
    """
    module_name = wrapper.__module__.split('.')[-1]

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True}

    combined = merge([self.defaults, defaults, self.opts, kwargs])
    is_source = combined['ftype'] == 'none'
    def_assign = 'content' if is_source else module_name
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('assign', def_assign)
    combined.setdefault('emit', is_source)
    combined.setdefault('pdictize', pdictize)

    # The pipe's conf holds the keys declared in `self.defaults`, overridden
    # by any explicitly passed `conf`.
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    uconf = DotDict(conf) if combined.get('dictize') else conf
    updates = {'conf': uconf, 'assign': combined.get('assign')}
    kwargs.update(updates)

    item = item or {}
    _input = DotDict(item) if combined.get('dictize') else item
    bfuncs = get_broadcast_funcs(**combined)
    skip = get_skip(_input, **combined)
    types = set() if skip else {combined['ftype'], combined['ptype']}

    # Dispatch functions are only needed when a field/parse type other
    # than 'pass' or 'none' is in play.
    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    parsed, orig_item = _dispatch(_input, bfuncs, dfuncs=dfuncs)
    kwargs.update({'skip': skip, 'stream': orig_item})

    # `async` is a reserved keyword since Python 3.7, so `self.async` no
    # longer parses; getattr reads the very same attribute.
    is_async = getattr(self, 'async')

    if is_async:
        stream = yield pipe(*parsed, **kwargs)
    else:
        stream = pipe(*parsed, **kwargs)

    one, assignment = get_assignment(stream, skip=skip, **combined)

    if skip or combined.get('emit'):
        stream = assignment
    else:
        # Reaching here implies `skip` is falsy.
        stream = assign(_input, assignment, one=one, **combined)

    if is_async:
        return_value(stream)
    else:
        for s in stream:
            yield s
def async_reducer(item, rules):
    """Asynchronously apply the substitution rules to one field of ``item``.

    ``kwargs`` and ``multi`` are not parameters; they are resolved from the
    enclosing scope. The reduction is delegated to ``ait.coop_reduce`` and
    its result is obtained via ``yield``.

    Args:
        item: Object supporting ``get``; also merged into the result.
        rules: Non-empty sequence of rules; ``rules[0]['field']`` names the
            field that is read and replaced.

    Returns:
        Nothing directly; the merged ``DotDict`` is handed to
        ``return_value``.
    """
    field = rules[0]['field']
    word = item.get(field, **kwargs)
    grouped = group_by(rules, 'flags')

    if multi:
        # With `multi`, the second element of each group is reduced.
        group_rules, reducer = [g[1] for g in grouped], multi_substitute
    else:
        group_rules, reducer = rules, substitute

    replacement = yield ait.coop_reduce(reducer, group_rules, word)
    return_value(DotDict(merge([item, {field: replacement}])))
def wrapper(item=None, **kwargs):
    """Run the wrapped ``pipe`` on one item and emit the resulting stream.

    ``self`` and ``pipe`` are resolved from the enclosing scope.

    Args:
        item: The input item; a falsy value is replaced with ``{}``.
        **kwargs: Options merged over ``self.defaults``, the local defaults
            and ``self.opts`` (later sources win). ``kwargs`` is updated in
            place with ``conf``, ``assign``, ``skip`` and ``stream`` before
            being forwarded to ``pipe``.

    Yields:
        In sync mode, each element of the resulting stream. In async mode,
        the value returned by ``pipe`` is yielded once and the final stream
        is passed to ``return_value`` (presumably an inlineCallbacks-style
        return helper -- confirm against its definition).
    """
    module_name = wrapper.__module__.split('.')[-1]

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True}

    combined = merge([self.defaults, defaults, self.opts, kwargs])
    is_source = combined['ftype'] == 'none'
    def_assign = 'content' if is_source else module_name
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('assign', def_assign)
    combined.setdefault('emit', is_source)
    combined.setdefault('pdictize', pdictize)

    # The pipe's conf holds the keys declared in `self.defaults`, overridden
    # by any explicitly passed `conf`.
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    uconf = DotDict(conf) if combined.get('dictize') else conf
    updates = {'conf': uconf, 'assign': combined.get('assign')}
    kwargs.update(updates)

    item = item or {}
    _input = DotDict(item) if combined.get('dictize') else item
    bfuncs = get_broadcast_funcs(**combined)
    skip = get_skip(_input, **combined)
    types = set() if skip else {combined['ftype'], combined['ptype']}

    # Dispatch functions are only needed when a field/parse type other
    # than 'pass' or 'none' is in play.
    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    parsed, orig_item = _dispatch(_input, bfuncs, dfuncs=dfuncs)
    kwargs.update({'skip': skip, 'stream': orig_item})

    # `async` is a reserved keyword since Python 3.7, so `self.async` no
    # longer parses; getattr reads the very same attribute.
    is_async = getattr(self, 'async')

    if is_async:
        stream = yield pipe(*parsed, **kwargs)
    else:
        stream = pipe(*parsed, **kwargs)

    one, assignment = get_assignment(stream, skip=skip, **combined)

    if skip or combined.get('emit'):
        stream = assignment
    else:
        # Reaching here implies `skip` is falsy.
        stream = assign(_input, assignment, one=one, **combined)

    if is_async:
        return_value(stream)
    else:
        for s in stream:
            yield s
def memoize(*args, **kwargs):
    """Build a ``Cache`` from the keyword options and delegate to its ``memoize``.

    Kwargs consumed here (everything else is forwarded):
        cache_type (str): Key into ``CACHE_CONFIGS``; ``'auto'`` defers to
            ``get_cache_type()`` (default: ``'simple'``).
        namespace: Passed to ``Cache`` (default: ``DEF_NS``).
        CACHE_TIMEOUT: Overrides the merged config value when given.
        CACHE_THRESHOLD: Overrides the merged config value when given.

    Returns:
        Whatever ``Cache.memoize(*args, **kwargs)`` returns.
    """
    requested = kwargs.pop('cache_type', 'simple')
    namespace = kwargs.pop('namespace', DEF_NS)
    cache_type = get_cache_type() if requested == 'auto' else requested
    config = merge([MEMOIZE_DEFAULTS, CACHE_CONFIGS[cache_type]])

    # Explicit overrides are removed from kwargs so they are not forwarded.
    for option in ('CACHE_TIMEOUT', 'CACHE_THRESHOLD'):
        if option in kwargs:
            config[option] = kwargs.pop(option)

    return Cache(namespace=namespace, **config).memoize(*args, **kwargs)
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content; pass ``kwargs['stream']``
            through unchanged instead
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items (delivered through ``return_value``)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {
        ...         'url': url, 'sanitize': True, 'skip_rows': 0,
        ...         'encoding': ENCODING}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if not skip:
        response = yield io.async_url_open(get_abspath(objconf.url))

        # `read_csv` is given the pipe's `skip_rows`/`col_names` settings
        # under the `first_row`/`custom_header` keyword names.
        overrides = {
            'first_row': objconf.skip_rows,
            'custom_header': objconf.col_names}

        reader_kwargs = merge([objconf, overrides])
        stream = auto_close(read_csv(response, **reader_kwargs), response)
    else:
        stream = kwargs['stream']

    return_value(stream)
def parse_conf(item, **kwargs):
    """Resolve the ``conf`` keyword against ``item`` via ``get_value``.

    Args:
        item: The item passed to every ``get_value`` call.
        **kwargs: Wrapped in ``Objectify`` with ``defaults`` and ``conf``
            defaulting to ``{}``; ``conf``, ``defaults`` and ``objectify``
            are read here.

    Returns:
        The direct result of ``get_value(item, **kwargs)`` when ``conf`` has
        no ``keys`` attribute or is a single-entry mapping whose key is one
        of ``subkey``/``value``/``terminal``. Otherwise the per-key values
        merged over ``defaults``, wrapped in ``Objectify`` when
        ``objectify`` is truthy.
    """
    kw = Objectify(kwargs, defaults={}, conf={})
    # TODO: fix so .items() returns a DotDict instance
    # parsed = {k: get_value(item, v) for k, v in kw.conf.items()}
    sentinel = {'subkey', 'value', 'terminal'}
    is_mapping = hasattr(kw.conf, 'keys')
    single_sentinel = (
        is_mapping and len(kw.conf) == 1 and sentinel.intersection(kw.conf))

    if not is_mapping or single_sentinel:
        return get_value(item, **kwargs)

    extra = remove_keys(kwargs, 'conf')
    parsed = {}

    for key in kw.conf:
        parsed[key] = get_value(item, kw.conf[key], **extra)

    result = merge([kw.defaults, parsed])
    return Objectify(result) if kw.objectify else result
def assign(item, assignment, key, one=False):
    """Yield ``item`` with the assignment stored under ``key``.

    Args:
        item: The base mapping to merge into.
        assignment: Iterator of values to assign.
        key: The key the value is stored under.
        one (bool): If true, store only the next value of ``assignment``;
            otherwise store all of them as a list.

    Yields:
        DotDict: A single merged item.
    """
    if one:
        value = next(assignment)
    else:
        value = list(assignment)

    combined = merge([item, {key: value}])
    yield DotDict(combined)
def test_merge(self):
    """Check ``pr.merge`` with and without ``pred``/``op``/``default``."""
    # Without options, the later record wins on a key clash
    result = pr.merge([{'a': 1, 'b': 2}, {'b': 10, 'c': 11}])
    nt.assert_equal({'a': 1, 'b': 10, 'c': 11}, result)

    # setup
    records = [{'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': 5, 'd': 6}]

    # Combine all keys
    result = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal({u'a': 1, u'c': 8, u'b': 6, u'd': 6}, result)

    def first(pair):
        return next(filter(partial(is_not, None), pair))

    result = pr.merge(records, pred=bool, op=first, default=None)
    nt.assert_equal({u'a': 1, u'b': 2, u'c': 3, u'd': 6}, result)

    # This will only reliably give the expected result for 2 records
    result = pr.merge(records, pred=bool, op=stats.mean, default=None)
    nt.assert_equal({u'a': 1, u'b': 3.0, u'c': 4.0, u'd': 6.0}, result)

    # Only combine key 'b'
    only_b = {u'a': 1, u'b': 6, u'c': 5, u'd': 6}
    nt.assert_equal(only_b, pr.merge(records, pred='b', op=sum))

    # Only combine keys that have the same value of 'b'
    same_b = {u'a': 1, u'b': 6, u'c': 5, u'd': 6}
    result = pr.merge(records, pred=itemgetter('b'), op=sum)
    nt.assert_equal(same_b, result)

    # This will reliably work for any number of records
    records = [
        {'a': 1, 'b': 4, 'c': 0},
        {'a': 2, 'b': 5, 'c': 2},
        {'a': 3, 'b': 6, 'd': 7}]

    counted = defaultdict(int)

    for record in records:
        for key in record:
            counted[key] += 1

    nt.assert_equal({u'a': 3, u'b': 3, u'c': 2, u'd': 1}, counted)

    summed = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal({u'a': 6, u'b': 15, u'c': 2, u'd': 7}, summed)

    averages = {u'a': 2.0, u'b': 5.0, u'c': 1.0, u'd': 7.0}
    result = pr.merge([summed, counted], pred=bool, op=ft.fpartial(truediv))
    nt.assert_equal(averages, result)

    # This should also reliably work for any number of records
    sum_and_count = ft.fpartial(ft.sum_and_count)
    merged = pr.merge(records, pred=bool, op=sum_and_count, default=None)
    result = {key: truediv(*pair) for key, pair in merged.items()}
    nt.assert_equal(averages, result)
def assign(item, assignment, **kwargs):
    """Yield ``item`` with the assignment stored under ``kwargs['assign']``.

    Args:
        item: The base mapping to merge into.
        assignment: Iterator of values to assign.

    Kwargs:
        assign: The key the value is stored under.
        one: If truthy, store only the next value of ``assignment``;
            otherwise store all of them as a list.
        dictize: If truthy, wrap the merged item in ``DotDict``.

    Yields:
        A single merged item.
    """
    key = kwargs.get('assign')

    if kwargs.get('one'):
        value = next(assignment)
    else:
        value = list(assignment)

    merged = merge([item, {key: value}])

    if kwargs.get('dictize'):
        yield DotDict(merged)
    else:
        yield merged
def test_merge(self):
    """Check ``pr.merge`` with and without ``pred``/``op``/``default``."""
    # Without options, the later record wins on a key clash
    result = pr.merge([{"a": 1, "b": 2}, {"b": 10, "c": 11}])
    nt.assert_equal({"a": 1, "b": 10, "c": 11}, result)

    # setup
    records = [{"a": 1, "b": 2, "c": 3}, {"b": 4, "c": 5, "d": 6}]

    # Combine all keys
    result = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal({u"a": 1, u"c": 8, u"b": 6, u"d": 6}, result)

    def first(pair):
        return next(filter(partial(is_not, None), pair))

    result = pr.merge(records, pred=bool, op=first, default=None)
    nt.assert_equal({u"a": 1, u"b": 2, u"c": 3, u"d": 6}, result)

    # This will only reliably give the expected result for 2 records
    result = pr.merge(records, pred=bool, op=stats.mean, default=None)
    nt.assert_equal({u"a": 1, u"b": 3.0, u"c": 4.0, u"d": 6.0}, result)

    # Only combine key 'b'
    only_b = {u"a": 1, u"b": 6, u"c": 5, u"d": 6}
    nt.assert_equal(only_b, pr.merge(records, pred="b", op=sum))

    # Only combine keys that have the same value of 'b'
    same_b = {u"a": 1, u"b": 6, u"c": 5, u"d": 6}
    result = pr.merge(records, pred=itemgetter("b"), op=sum)
    nt.assert_equal(same_b, result)

    # This will reliably work for any number of records
    records = [
        {"a": 1, "b": 4, "c": 0},
        {"a": 2, "b": 5, "c": 2},
        {"a": 3, "b": 6, "d": 7},
    ]

    counted = defaultdict(int)

    for record in records:
        for key in record:
            counted[key] += 1

    nt.assert_equal({u"a": 3, u"b": 3, u"c": 2, u"d": 1}, counted)

    summed = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal({u"a": 6, u"b": 15, u"c": 2, u"d": 7}, summed)

    averages = {u"a": 2.0, u"b": 5.0, u"c": 1.0, u"d": 7.0}
    result = pr.merge([summed, counted], pred=bool, op=ft.fpartial(truediv))
    nt.assert_equal(averages, result)

    # This should also reliably work for any number of records
    sum_and_count = ft.fpartial(ft.sum_and_count)
    merged = pr.merge(records, pred=bool, op=sum_and_count, default=None)
    result = {key: truediv(*pair) for key, pair in merged.items()}
    nt.assert_equal(averages, result)
def getpipe(args, pipe=SyncPipe):
    """Build a pipe for one source and return its ``list`` attribute.

    Args:
        args: A ``(source, sleep)`` pair; ``source`` supplies the pipe type
            (default ``'fetch'``) and overrides the ``{'sleep': sleep}``
            base configuration.
        pipe: The pipe factory to call (default: ``SyncPipe``).

    Returns:
        The ``list`` attribute of the constructed pipe.
    """
    source, sleep = args
    kind = source.get('type', 'fetch')
    pipeline = pipe(kind, conf=merge([{'sleep': sleep}, source]))
    return pipeline.list
def wrapper(items=None, **kwargs):
    """Run the wrapped ``pipe`` over a stream of items and emit the result.

    ``self`` and ``pipe`` are resolved from the enclosing scope. The
    wrapper records ``name`` and ``sub_type`` in its own ``__dict__``.

    Args:
        items: The input stream; a falsy value is replaced with an empty
            iterator.
        **kwargs: Options merged over ``self.defaults``, the local defaults
            and ``self.opts`` (later sources win). ``kwargs`` is updated in
            place with ``conf`` and ``assign`` before being forwarded to
            ``pipe``.

    Yields:
        In sync mode, each element of the resulting stream. In async mode,
        the value returned by ``pipe`` is yielded once and the final stream
        is passed to ``return_value`` (presumably an inlineCallbacks-style
        return helper -- confirm against its definition).
    """
    module_name = wrapper.__module__.split('.')[-1]
    wrapper.__dict__['name'] = module_name

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True, 'emit': True, 'assign': module_name}

    combined = merge([self.defaults, defaults, self.opts, kwargs])
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('pdictize', pdictize)

    # The pipe's conf holds the keys declared in `self.defaults`, overridden
    # by any explicitly passed `conf`.
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    uconf = DotDict(conf) if combined.get('dictize') else conf
    updates = {'conf': uconf, 'assign': combined.get('assign')}
    kwargs.update(updates)

    items = items or iter([])
    _INPUT = map(DotDict, items) if combined.get('dictize') else items
    bfuncs = get_broadcast_funcs(**combined)
    types = {combined['ftype'], combined['ptype']}

    # Dispatch functions are only needed when a field/parse type other
    # than 'pass' or 'none' is in play.
    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    pairs = (_dispatch(item, bfuncs, dfuncs=dfuncs) for item in _INPUT)
    parsed, _ = _dispatch(DotDict(), bfuncs, dfuncs=dfuncs)

    # - operators can't skip items
    # - purposely setting both variables to maps of the same iterable
    #   since only one is intended to be used at any given time
    # - `tuples` is an iterator of tuples of the first two `parsed`
    #   elements
    tuples = ((p[0][0], p[0][1]) for p in pairs)
    orig_stream = (p[0][0] for p in pairs)
    objconf = parsed[1]

    # `async` is a reserved keyword since Python 3.7, so `self.async` no
    # longer parses; getattr reads the very same attribute.
    is_async = getattr(self, 'async')

    if is_async:
        stream = yield pipe(orig_stream, objconf, tuples, **kwargs)
    else:
        stream = pipe(orig_stream, objconf, tuples, **kwargs)

    sub_type = 'aggregator' if hasattr(stream, 'keys') else 'composer'
    wrapper.__dict__['sub_type'] = sub_type

    # operators can only assign one value per item and can't skip items
    _, assignment = get_assignment(stream, **combined)

    if combined.get('emit'):
        stream = assignment
    else:
        singles = (iter([v]) for v in assignment)
        assigned = (
            assign({}, s, one=True, **combined) for s in singles)
        stream = multiplex(assigned)

    if is_async:
        return_value(stream)
    else:
        for s in stream:
            yield s
def getpipe(args, pipe=SyncPipe):
    """Build a pipe for one source and return its ``output`` attribute.

    Args:
        args: A ``(source, conf)`` pair; ``source`` supplies the pipe type
            (default ``'fetch'``) and overrides the entries of ``conf``.
        pipe: The pipe factory to call (default: ``SyncPipe``).

    Returns:
        The ``output`` attribute of the constructed pipe.
    """
    source, conf = args
    kind = source.get('type', 'fetch')
    pipeline = pipe(kind, conf=merge([conf, source]))
    return pipeline.output
def wrapper(items=None, **kwargs):
    """Run the wrapped ``pipe`` over a stream of items and emit the result.

    ``self`` and ``pipe`` are resolved from the enclosing scope. The
    wrapper records ``name`` and ``sub_type`` in its own ``__dict__``.

    Args:
        items: The input stream; a falsy value is replaced with an empty
            iterator.
        **kwargs: Options merged over ``self.defaults``, the local defaults
            and ``self.opts`` (later sources win). ``kwargs`` is updated in
            place with ``conf`` and ``assign`` before being forwarded to
            ``pipe``.

    Yields:
        In sync mode, each element of the resulting stream. In async mode,
        the value returned by ``pipe`` is yielded once and the final stream
        is passed to ``return_value`` (presumably an inlineCallbacks-style
        return helper -- confirm against its definition).
    """
    module_name = wrapper.__module__.split('.')[-1]
    wrapper.__dict__['name'] = module_name

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True, 'emit': True, 'assign': module_name}

    combined = merge([self.defaults, defaults, self.opts, kwargs])
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('pdictize', pdictize)

    # The pipe's conf holds the keys declared in `self.defaults`, overridden
    # by any explicitly passed `conf`.
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    # replace conf with dictized version so we can access its
    # attributes even if we already extracted a value
    updates = {'conf': DotDict(conf), 'assign': combined.get('assign')}
    kwargs.update(updates)

    items = items or iter([])
    _INPUT = map(DotDict, items) if combined.get('dictize') else items
    bfuncs = get_broadcast_funcs(**combined)
    types = {combined['ftype'], combined['ptype']}

    # Dispatch functions are only needed when a field/parse type other
    # than 'pass' or 'none' is in play.
    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    pairs = (_dispatch(item, bfuncs, dfuncs=dfuncs) for item in _INPUT)
    parsed, _ = _dispatch(DotDict(), bfuncs, dfuncs=dfuncs)

    # - operators can't skip items
    # - purposely setting both variables to maps of the same iterable
    #   since only one is intended to be used at any given time
    # - `tuples` is an iterator of tuples of the first two `parsed`
    #   elements
    tuples = ((p[0][0], p[0][1]) for p in pairs)
    orig_stream = (p[0][0] for p in pairs)
    objconf = parsed[1]

    # `async` is a reserved keyword since Python 3.7, so `self.async` no
    # longer parses; getattr reads the very same attribute.
    is_async = getattr(self, 'async')

    if is_async:
        stream = yield pipe(orig_stream, objconf, tuples, **kwargs)
    else:
        stream = pipe(orig_stream, objconf, tuples, **kwargs)

    sub_type = 'aggregator' if hasattr(stream, 'keys') else 'composer'
    wrapper.__dict__['sub_type'] = sub_type

    # operators can only assign one value per item and can't skip items
    _, assignment = get_assignment(stream, **combined)

    if combined.get('emit'):
        stream = assignment
    else:
        singles = (iter([v]) for v in assignment)
        key = combined.get('assign')
        assigned = (assign({}, s, key, one=True) for s in singles)
        stream = multiplex(assigned)

    if is_async:
        return_value(stream)
    else:
        for s in stream:
            yield s
def parser(stream, objconf, tuples, **kwargs):
    """ Parses the pipe content

    Args:
        stream (Iter[dict]): The source. Note: this shares the `tuples`
            iterator, so consuming it will consume `tuples` as well.

        objconf (obj): The pipe configuration (an Objectify instance)

        tuples (Iter[(dict, obj)]): Iterable of tuples of (item, objconf)
            `item` is an element in the source stream and `objconf` is the
            item configuration (an Objectify instance). Note: this shares
            the `stream` iterator, so consuming it will consume `stream` as
            well.

        kwargs (dict): Keyword arguments.

    Kwargs:
        other (Iter[dict]): stream to join

    Returns:
        Iter(dict): The output stream

    Examples:
        >>> from itertools import repeat
        >>> from meza.fntools import Objectify
        >>>
        >>> stream = ({'x': 'foo', 'sum': x} for x in range(5))
        >>> other = ({'x': 'foo', 'count': x + 5} for x in range(5))
        >>> objconf = Objectify({})
        >>> tuples = zip(stream, repeat(objconf))
        >>> joined = parser(stream, objconf, tuples, other=other)
        >>> next(joined) == {'x': 'foo', 'sum': 0, 'count': 5}
        True
        >>> len(list(joined))
        24
        >>> objconf = Objectify({'join_key': 'x', 'other_join_key': 'y'})
        >>> stream = ({'x': 'foo-%s' % x, 'sum': x} for x in range(5))
        >>> other = ({'y': 'foo-%s' % x, 'count': x + 5} for x in range(5))
        >>> tuples = zip(stream, repeat(objconf))
        >>> joined = parser(stream, objconf, tuples, other=other)
        >>> next(joined) == {'count': 5, 'x': 'foo-0', 'sum': 0, 'y': 'foo-0'}
        True
        >>> len(list(joined))
        4
    """
    def compare(x, y):
        # `x_key`/`y_key` are bound below, before the first comparison runs.
        if not objconf.lower:
            return x.get(x_key) == y.get(y_key)

        x_value, y_value = x.get(x_key, ''), y.get(y_key, '')
        return x_value.lower() == y_value.lower()

    # With no join key configured, defer entirely to `join`.
    if not (objconf.join_key or objconf.other_join_key):
        return join(stream, kwargs['other'])

    # Each key falls back to the other one when only one is configured.
    x_key = objconf.join_key or objconf.other_join_key
    y_key = objconf.other_join_key or x_key
    candidates = product(stream, kwargs['other'])
    return (merge([x, y]) for x, y in candidates if compare(x, y))
def test_merge(self):
    """Check ``pr.merge`` with and without ``pred``/``op``/``default``."""
    # Without options, the later record wins on a key clash
    result = pr.merge([{'a': 1, 'b': 2}, {'b': 10, 'c': 11}])
    nt.assert_equal({'a': 1, 'b': 10, 'c': 11}, result)

    # setup
    records = [{'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': 5, 'd': 6}]

    # Combine all keys
    result = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal({u'a': 1, u'c': 8, u'b': 6, u'd': 6}, result)

    def first(pair):
        return next(filter(partial(is_not, None), pair))

    result = pr.merge(records, pred=bool, op=first, default=None)
    nt.assert_equal({u'a': 1, u'b': 2, u'c': 3, u'd': 6}, result)

    # This will only reliably give the expected result for 2 records
    result = pr.merge(records, pred=bool, op=stats.mean, default=None)
    nt.assert_equal({u'a': 1, u'b': 3.0, u'c': 4.0, u'd': 6.0}, result)

    # Only combine key 'b'
    only_b = {u'a': 1, u'b': 6, u'c': 5, u'd': 6}
    nt.assert_equal(only_b, pr.merge(records, pred='b', op=sum))

    # Only combine keys that have the same value of 'b'
    same_b = {u'a': 1, u'b': 6, u'c': 5, u'd': 6}
    result = pr.merge(records, pred=itemgetter('b'), op=sum)
    nt.assert_equal(same_b, result)

    # This will reliably work for any number of records
    records = [
        {'a': 1, 'b': 4, 'c': 0},
        {'a': 2, 'b': 5, 'c': 2},
        {'a': 3, 'b': 6, 'd': 7}]

    counted = defaultdict(int)

    for record in records:
        for key in record:
            counted[key] += 1

    nt.assert_equal({u'a': 3, u'b': 3, u'c': 2, u'd': 1}, counted)

    summed = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal({u'a': 6, u'b': 15, u'c': 2, u'd': 7}, summed)

    averages = {u'a': 2.0, u'b': 5.0, u'c': 1.0, u'd': 7.0}
    result = pr.merge([summed, counted], pred=bool, op=ft.fpartial(truediv))
    nt.assert_equal(averages, result)

    # This should also reliably work for any number of records
    sum_and_count = ft.fpartial(ft.sum_and_count)
    merged = pr.merge(records, pred=bool, op=sum_and_count, default=None)
    result = {key: truediv(*pair) for key, pair in merged.items()}
    nt.assert_equal(averages, result)
def reducer(item, rule):
    """Apply one rename/copy rule to ``item``.

    Args:
        item: The mapping being transformed.
        rule: Object with ``field``, ``newval`` and ``copy`` attributes.

    Returns:
        DotDict: ``item`` (minus ``rule.field`` unless ``rule.copy`` is
        truthy) merged with ``{rule.newval: item.get(rule.field)}`` when
        ``rule.newval`` is truthy.
    """
    if rule.newval:
        addition = {rule.newval: item.get(rule.field)}
    else:
        addition = {}

    if rule.copy:
        base = item
    else:
        base = remove_keys(item, rule.field)

    return DotDict(merge([base, addition]))