def wrapper(item=None, **kwargs):
    """Process a single item through the decorated processor pipe.

    Builds the effective configuration by layering instance defaults,
    module defaults, instance opts, and per-call kwargs; dispatches the
    item through the broadcast/dispatch functions; then runs the wrapped
    `pipe` either as a coroutine (async) or as a plain generator.

    Args:
        item (dict): the entry to process (wrapped in DotDict when
            'dictize' is set); defaults to an empty dict
        kwargs: per-call options merged into the configuration; may
            include 'conf' overrides

    Yields:
        processed stream items (sync path) — on the async path the result
        is delivered via `return_value` instead.

    NOTE(review): relies on `self`, `pipe`, and several helpers from the
    enclosing decorator scope (not visible in this chunk).
    """
    # last component of the defining module's dotted path, used as the
    # default assignment key for non-source pipes
    module_name = wrapper.__module__.split('.')[-1]
    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass', 'objectify': True}

    # later entries win: call-site kwargs override opts override defaults
    combined = merge([self.defaults, defaults, self.opts, kwargs])
    is_source = combined['ftype'] == 'none'
    def_assign = 'content' if is_source else module_name
    extracted = 'extract' in combined
    # presumably: extracted confs only get dictized piece-wise when
    # 'listize' asks for it — TODO confirm against get_broadcast_funcs
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('assign', def_assign)
    combined.setdefault('emit', is_source)
    combined.setdefault('pdictize', pdictize)

    # narrow the combined options back down to just the declared defaults,
    # then let an explicit per-call 'conf' override those values
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    uconf = DotDict(conf) if combined.get('dictize') else conf
    updates = {'conf': uconf, 'assign': combined.get('assign')}
    kwargs.update(updates)

    item = item or {}
    _input = DotDict(item) if combined.get('dictize') else item
    bfuncs = get_broadcast_funcs(**combined)
    skip = get_skip(_input, **combined)

    # skipped items bypass dispatch-function construction entirely
    types = set([]) if skip else {combined['ftype'], combined['ptype']}

    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    parsed, orig_item = _dispatch(_input, bfuncs, dfuncs=dfuncs)
    kwargs.update({'skip': skip, 'stream': orig_item})

    # NOTE(review): 'async' became a reserved keyword in Python 3.7, so
    # `self.async` no longer parses on modern interpreters — this attribute
    # needs renaming (e.g. self.async_) for 3.7+ support; confirm target
    # Python version.
    if self.async:
        stream = yield pipe(*parsed, **kwargs)
    else:
        stream = pipe(*parsed, **kwargs)

    one, assignment = get_assignment(stream, skip=skip, **combined)

    if skip or combined.get('emit'):
        stream = assignment
    elif not skip:
        stream = assign(_input, assignment, one=one, **combined)

    if self.async:
        # async (inlineCallbacks-style) generators deliver their result
        # through return_value rather than StopIteration
        return_value(stream)
    else:
        for s in stream:
            yield s
def gen_records(src, *paths, report_date=None, blacklist=None, **kwargs):
    """Parse a JSON source and yield cleaned, key-renamed record dicts.

    Args:
        src (str): JSON-encoded content
        *paths: one or two path segments — the first locates the records
            inside `src`; the optional second points at a list of
            sub-records nested inside each record (may be dot-separated)
        report_date (str): date in `S3_DATE_FORMAT` stamped onto every
            record (effectively required despite the keyword default —
            `strptime` raises if it is missing)
        blacklist (List[str]): keys to strip from each record (optional)
        **kwargs:
            change (dict): old-key -> new-key rename map
            nested_path (str): key of a nested dict whose numbered entries
                ('race-7-description', ...) are regrouped into records
            listize (bool): wrap a single record dict in a list

    Yields:
        dict: a cleaned record with blacklisted keys removed and remaining
        keys renamed via `change`
    """
    data = DotDict(loads(src))
    change = kwargs.get("change") or {}
    nested_path = kwargs.get("nested_path", "")
    # normalize so `blacklist + [...]` and dfilter never see None
    blacklist = blacklist or []
    report_datetime = dt.strptime(report_date, S3_DATE_FORMAT)
    # hoist the loop-invariant date string out of the record loop
    report_iso = report_datetime.isoformat()

    try:
        path, subpath = paths
    except ValueError:
        path, subpath = paths[0], None

    records = data.get(path, [])

    if records and kwargs.get("listize"):
        records = [records]

    for record in records:
        record["date"] = report_iso

        if subpath:
            # drop the top-level key of `subpath` along with the blacklist;
            # split(...)[0] handles both dotted and plain subpaths, so the
            # old duplicated if/else (and its unused `subpath_1`) collapse
            # into one expression
            top_key = subpath.split(".", maxsplit=1)[0]
            reference_record = dfilter(record, blacklist + [top_key])

            for new_record in DotDict(record)[subpath]:
                combined = {**new_record, **reference_record}
                clean_record = dfilter(combined, blacklist)
                yield {change.get(k, k): v for k, v in clean_record.items()}
        elif nested_path:
            # keys look like 'race-7-description': group flattened entries
            # by their embedded number(s)
            def keyfunc(pair):
                return "-".join(re.findall(r"\d+", pair[0]))

            reference_record = dfilter(record, [nested_path])
            nested = record.get(nested_path)

            if nested:
                flattened = flatten(nested)

                # NOTE(review): groupby only merges *adjacent* equal keys,
                # so `flattened` must already be ordered by number — TODO
                # confirm flatten() preserves that ordering
                for key, group in groupby(flattened, keyfunc):
                    new_record = {re.sub(r"\d+-", "", k): v for k, v in group}
                    combined = {**new_record, **reference_record}
                    clean_record = dfilter(combined, blacklist)
                    yield {
                        change.get(k, k): v for k, v in clean_record.items()
                    }
        else:
            clean_record = dfilter(record, blacklist)
            yield {change.get(k, k): v for k, v in clean_record.items()}
def parser(item, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        item (obj): The entry to process (a DotDict instance)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko.dotdict import DotDict
        >>> from meza.fntools import Objectify
        >>>
        >>> item = DotDict()
        >>> conf = {'guid': 'a1', 'mediaThumbURL': 'image.png'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item}
        >>> result = parser(item, objconf, **kwargs)
        >>> result == {'media:thumbnail': {'url': 'image.png'}, 'y:id': 'a1'}
        True
    """
    if skip:
        return kwargs['stream']

    # remap each configured key to its canonical RSS name, resolving the
    # configured value against the item (falling back to the literal value)
    pairs = []

    for key, value in objconf.items():
        pairs.append((RSS.get(key, key), item.get(value, value, **kwargs)))

    return DotDict(pairs)
def parser(_, attrs, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        attrs (List[dict]): Attributes
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter(dict): The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> attrs = [
        ...     {'key': 'title', 'value': 'the title'},
        ...     {'key': 'desc', 'value': 'the desc'}]
        >>> result = parser(None, map(Objectify, attrs))
        >>> result == {'title': 'the title', 'desc': 'the desc'}
        True
    """
    if skip:
        return kwargs['stream']

    # build a dict from each attribute's (key, value) pair
    return DotDict((attr.key, attr.value) for attr in attrs)
def meta_reducer(item, rules):
    """Apply every substitution rule to one field of `item`.

    All rules target the same field (taken from the first rule). Relies on
    `kwargs` and `multi` from the enclosing scope.

    Returns:
        DotDict: the item with the field replaced by the substituted text
    """
    field = rules[0]['field']
    original = item.get(field, **kwargs)
    grouped = group_by(rules, 'flags')

    # multi-mode folds over flag-groups with the bulk substituter;
    # otherwise fold rule-by-rule with the single substituter
    if multi:
        folder, sequence = multi_substitute, [pair[1] for pair in grouped]
    else:
        folder, sequence = substitute, rules

    replacement = reduce(folder, sequence, original)
    return DotDict(merge([item, {field: replacement}]))
def async_reducer(item, rules):
    """Async twin of `meta_reducer`: substitute one field cooperatively.

    All rules target the same field (taken from the first rule). Relies on
    `kwargs` and `multi` from the enclosing scope; delivers its result via
    `return_value` (inlineCallbacks-style coroutine).
    """
    field = rules[0]['field']
    original = item.get(field, **kwargs)
    grouped = group_by(rules, 'flags')

    # multi-mode folds over flag-groups with the bulk substituter;
    # otherwise fold rule-by-rule with the single substituter
    if multi:
        folder, sequence = multi_substitute, [pair[1] for pair in grouped]
    else:
        folder, sequence = substitute, rules

    # cooperative reduce yields control back to the event loop
    replacement = yield ait.coop_reduce(folder, sequence, original)
    return_value(DotDict(merge([item, {field: replacement}])))
def parser(item, params, skip=False, **kwargs):
    """ Parsers the pipe content

    Args:
        item (obj): The entry to process (a DotDict instance)
        params (List[dict]): Query parameters
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        dict: The item

    Examples:
        >>> from meza.fntools import Objectify
        >>>
        >>> item = DotDict()
        >>> params = {'key': 's', 'value': 'gm'}
        >>> path = [{'value': 'rss'}, {'value': 'headline'}]
        >>> base = 'http://finance.yahoo.com'
        >>> conf = {'base': base, 'path': path, 'params': params}
        >>> kwargs = {'stream': item, 'conf': conf}
        >>> result = parser(item, [Objectify(params)], **kwargs)
        >>> sorted(result.keys()) == [
        ...     'fragment', 'netloc', 'params', 'path', 'query', 'scheme',
        ...     'url']
        True
        >>> result['url'] == 'http://finance.yahoo.com/rss/headline?s=gm'
        True
    """
    if skip:
        return kwargs['stream']

    conf = kwargs.pop('conf')

    # resolve each configured path segment against the item
    segments = [get_value(item, DotDict(p), **kwargs) for p in conf.get('path')]
    query = urlencode([(param.key, param.value) for param in params])
    base_url = urljoin(conf['base'], '/'.join(segments))
    return cast_url('%s?%s' % (base_url, query))
def get_broadcast_funcs(**kwargs):
    """Build the (field getter, conf parser) pair used to fan out an item.

    Returns:
        tuple: (ffunc, get_pieces) where `ffunc` extracts the relevant
        field from an item and `get_pieces` parses the configuration
        piece(s) against it.
    """
    kw = Objectify(kwargs, conf={})
    # an 'extract' key narrows the conf down to a single sub-conf
    pieces = kw.conf[kw.extract] if kw.extract else kw.conf
    no_conf = remove_keys(kwargs, 'conf')
    noop = partial(cast, _type='none')

    if kw.listize:
        # one parser per conf piece, broadcast over the item
        listed = listize(pieces)
        piece_defs = map(DotDict, listed) if kw.pdictize else listed
        parser = partial(parse_conf, **no_conf)
        pfuncs = [partial(parser, conf=conf) for conf in piece_defs]

        def get_pieces(item):
            return broadcast(item, *pfuncs)
    elif kw.ptype != 'none':
        conf = DotDict(pieces) if kw.pdictize and pieces else pieces
        get_pieces = partial(parse_conf, conf=conf, **no_conf)
    else:
        get_pieces = noop

    if kw.ftype == 'none':
        ffunc = noop
    else:
        ffunc = partial(get_field, **kwargs)

    return (ffunc, get_pieces)
def reducer(item, rule):
    """Apply one rename rule to an item.

    Copies `rule.field`'s value under `rule.newval` (when set) and drops
    the original key unless `rule.copy` is truthy.
    """
    if rule.newval:
        addition = {rule.newval: item.get(rule.field)}
    else:
        addition = {}

    base = item if rule.copy else remove_keys(item, rule.field)
    return DotDict(merge([base, addition]))
def wrapper(items=None, **kwargs):
    """Process a stream of items through the decorated operator pipe.

    Operator twin of the processor wrapper: builds the effective
    configuration, dispatches every input item, and hands the whole stream
    to the wrapped `pipe` either as a coroutine (async) or as a plain
    generator.

    Args:
        items (iterable): the input stream (each element wrapped in
            DotDict when 'dictize' is set); defaults to an empty iterator
        kwargs: per-call options merged into the configuration; may
            include 'conf' overrides

    Yields:
        processed stream items (sync path) — on the async path the result
        is delivered via `return_value` instead.

    NOTE(review): relies on `self`, `pipe`, and several helpers from the
    enclosing decorator scope (not visible in this chunk).
    """
    # last component of the defining module's dotted path; exposed on the
    # wrapper itself for introspection
    module_name = wrapper.__module__.split('.')[-1]
    wrapper.__dict__['name'] = module_name
    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True, 'emit': True, 'assign': module_name}

    # later entries win: call-site kwargs override opts override defaults
    combined = merge([self.defaults, defaults, self.opts, kwargs])
    extracted = 'extract' in combined
    # presumably: extracted confs only get dictized piece-wise when
    # 'listize' asks for it — TODO confirm against get_broadcast_funcs
    pdictize = combined.get('listize') if extracted else True
    combined.setdefault('pdictize', pdictize)

    # narrow the combined options back down to just the declared defaults,
    # then let an explicit per-call 'conf' override those values
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    # replace conf with dictized version so we can access its
    # attributes even if we already extracted a value
    updates = {'conf': DotDict(conf), 'assign': combined.get('assign')}
    kwargs.update(updates)

    items = items or iter([])
    _INPUT = map(DotDict, items) if combined.get('dictize') else items
    bfuncs = get_broadcast_funcs(**combined)
    types = {combined['ftype'], combined['ptype']}

    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    # lazily dispatch every input item; a dispatch over an empty DotDict
    # yields the parsed objconf without consuming the input
    pairs = (_dispatch(item, bfuncs, dfuncs=dfuncs) for item in _INPUT)
    parsed, _ = _dispatch(DotDict(), bfuncs, dfuncs=dfuncs)

    # - operators can't skip items
    # - purposely setting both variables to maps of the same iterable
    #   since only one is intended to be used at any given time
    # - `tuples` is an iterator of tuples of the first two `parsed`
    #   elements
    tuples = ((p[0][0], p[0][1]) for p in pairs)
    orig_stream = (p[0][0] for p in pairs)
    objconf = parsed[1]

    # NOTE(review): 'async' became a reserved keyword in Python 3.7, so
    # `self.async` no longer parses on modern interpreters — this attribute
    # needs renaming (e.g. self.async_) for 3.7+ support; confirm target
    # Python version.
    if self.async:
        stream = yield pipe(orig_stream, objconf, tuples, **kwargs)
    else:
        stream = pipe(orig_stream, objconf, tuples, **kwargs)

    # a dict-like result means the operator aggregated the stream into a
    # single value; otherwise it composed a new stream
    sub_type = 'aggregator' if hasattr(stream, 'keys') else 'composer'
    wrapper.__dict__['sub_type'] = sub_type

    # operators can only assign one value per item and can't skip items
    _, assignment = get_assignment(stream, **combined)

    if combined.get('emit'):
        stream = assignment
    else:
        # wrap each assigned value in its own single-item iterator so
        # `assign` can consume it with one=True, then flatten back out
        singles = (iter([v]) for v in assignment)
        key = combined.get('assign')
        assigned = (assign({}, s, key, one=True) for s in singles)
        stream = multiplex(assigned)

    if self.async:
        # async (inlineCallbacks-style) generators deliver their result
        # through return_value rather than StopIteration
        return_value(stream)
    else:
        for s in stream:
            yield s
def assign(item, assignment, key, one=False):
    """Merge the assignment value(s) into `item` under `key`.

    Consumes `assignment` (an iterator): either its first element when
    `one` is set, or the whole thing as a list. Yields a single DotDict.
    """
    if one:
        value = next(assignment)
    else:
        value = list(assignment)

    yield DotDict(merge([item, {key: value}]))
def assign(item, assignment, **kwargs):
    """Merge the assignment value(s) into `item` under kwargs['assign'].

    Consumes `assignment` (an iterator): either its first element when
    kwargs['one'] is set, or the whole thing as a list. Yields the merged
    result, dictized when kwargs['dictize'] is set.
    """
    key = kwargs.get('assign')

    if kwargs.get('one'):
        value = next(assignment)
    else:
        value = list(assignment)

    merged = merge([item, {key: value}])

    if kwargs.get('dictize'):
        yield DotDict(merged)
    else:
        yield merged