Ejemplo n.º 1
0
    def gotTagStart(self, name, attributes):
        """Create an Element for an opening tag and register it.

        The new element is pushed onto `self.elementstack` and, when a
        parent exists, appended to that parent.

        Args:
            name (str): The tag name.
            attributes (dict): The tag attributes; they are passed through
                `unescape_dict` before any further processing.
        """
        self.indentlevel += 2
        parent = self._check_parent(self._getparent(), name)

        unescaped = unescape_dict(attributes)
        inherited = self.nsstack[-1][0]
        declared = dict(self._gen_newspaces(unescaped))
        remaining = dict(self._gen_new_attrs(unescaped))
        scoped = merge([inherited, declared])
        resolved = dict(self._gen_attrs(remaining, scoped))

        el = Element(
            name, resolved, parent, self.filename, self.saveMark(),
            case_insensitive=self.case_insensitive,
            namespace=scoped.get(''))

        prefixes = invert_dict(declared)
        el.addPrefixes(prefixes)

        # a new namespace frame is only pushed when `_gen_newspaces`
        # produced at least one entry for this tag
        if declared:
            rscopy = merge([self.nsstack[-1][2], prefixes])
            self.nsstack.append((scoped, el, rscopy))

        self.elementstack.append(el)

        if parent:
            parent.appendChild(el)

        # in lenient mode, tags listed in `soonClosers` are closed right away
        if self.lenient and el.tagName in self.soonClosers:
            self.gotTagEnd(name)
Ejemplo n.º 2
0
Archivo: csv.py Proyecto: sottom/riko
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item (returned as is when `skip` is set)

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {
        ...     'url': url, 'sanitize': True, 'skip_rows': 0,
        ...     'encoding': ENCODING}
        >>> objconf = Objectify(conf)
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if skip:
        return kwargs['stream']

    # `read_csv` is given these two settings under different names
    renamed = {
        'first_row': objconf.skip_rows,
        'custom_header': objconf.col_names}

    f = fetch(decode=True, **objconf)
    rkwargs = merge([objconf, renamed])
    return auto_close(read_csv(f, **rkwargs), f)
Ejemplo n.º 3
0
Archivo: csv.py Proyecto: nerevu/riko
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item (only read when `skip` is set)

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {
        ...     'url': url, 'sanitize': True, 'skip_rows': 0,
        ...     'encoding': ENCODING}
        >>> objconf = Objectify(conf)
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if not skip:
        # map the pipe's option names onto the names `read_csv` is called with
        overrides = {}
        overrides['first_row'] = objconf.skip_rows
        overrides['custom_header'] = objconf.col_names

        source = fetch(decode=True, **objconf)
        reader_kwargs = merge([objconf, overrides])
        result = auto_close(read_csv(source, **reader_kwargs), source)
    else:
        result = kwargs['stream']

    return result
Ejemplo n.º 4
0
 def meta_reducer(item, rules):
     """Apply `rules` to one field of `item` and return the updated item.

     The field name is taken from the first rule. `kwargs` and `multi` come
     from the enclosing scope.
     """
     field = rules[0]['field']
     word = item.get(field, **kwargs)
     grouped = group_by(rules, 'flags')

     if multi:
         group_rules, reducer = [g[1] for g in grouped], multi_substitute
     else:
         group_rules, reducer = rules, substitute

     replacement = reduce(reducer, group_rules, word)
     updated = merge([item, {field: replacement}])
     return DotDict(updated)
Ejemplo n.º 5
0
Archivo: regex.py Proyecto: nerevu/riko
 def meta_reducer(item, rules):
     """Return `item` with one field rewritten by folding `rules` over it.

     The target field is named by `rules[0]`; `kwargs` and `multi` are read
     from the enclosing scope.
     """
     field = rules[0]['field']
     word = item.get(field, **kwargs)
     grouped = group_by(rules, 'flags')

     # in multi mode the second element of each group is folded with
     # `multi_substitute`; otherwise the plain rules are used
     group_rules = rules
     reducer = substitute

     if multi:
         group_rules = [group[1] for group in grouped]
         reducer = multi_substitute

     replacement = reduce(reducer, group_rules, word)
     return DotDict(merge([item, {field: replacement}]))
Ejemplo n.º 6
0
        def wrapper(item=None, **kwargs):
            """Apply the wrapped pipe to a single item and emit the result.

            Args:
                item (dict): The entry to process (default: None, in which
                    case an empty dict is used).
                kwargs (dict): Keyword arguments. They are merged last over
                    the decorator's defaults and options, and are updated in
                    place with `conf`, `assign`, `skip` and `stream` before
                    being passed to the pipe.

            Yields:
                In sync mode, each element of the resulting stream. In async
                mode, the result of calling the pipe is yielded and the
                resulting stream is handed to `return_value`.
            """
            module_name = wrapper.__module__.split('.')[-1]

            defaults = {
                'dictize': True,
                'ftype': 'pass',
                'ptype': 'pass',
                'objectify': True
            }

            combined = merge([self.defaults, defaults, self.opts, kwargs])
            is_source = combined['ftype'] == 'none'
            def_assign = 'content' if is_source else module_name
            extracted = 'extract' in combined
            pdictize = combined.get('listize') if extracted else True

            combined.setdefault('assign', def_assign)
            combined.setdefault('emit', is_source)
            combined.setdefault('pdictize', pdictize)

            # the conf holds only the keys declared in the decorator defaults,
            # overridden by any explicitly passed `conf`
            conf = {k: combined[k] for k in self.defaults}
            conf.update(kwargs.get('conf', {}))
            combined.update({'conf': conf})

            uconf = DotDict(conf) if combined.get('dictize') else conf
            updates = {'conf': uconf, 'assign': combined.get('assign')}
            kwargs.update(updates)

            item = item or {}
            _input = DotDict(item) if combined.get('dictize') else item
            bfuncs = get_broadcast_funcs(**combined)
            skip = get_skip(_input, **combined)
            types = set() if skip else {combined['ftype'], combined['ptype']}

            # dispatch funcs are only needed for types other than pass/none
            if types.difference({'pass', 'none'}):
                dfuncs = get_dispatch_funcs(**combined)
            else:
                dfuncs = None

            parsed, orig_item = _dispatch(_input, bfuncs, dfuncs=dfuncs)
            kwargs.update({'skip': skip, 'stream': orig_item})

            # `async` is a reserved keyword as of Python 3.7, so `self.async`
            # is a SyntaxError there; look the attribute up by name instead
            is_async = getattr(self, 'async')

            if is_async:
                stream = yield pipe(*parsed, **kwargs)
            else:
                stream = pipe(*parsed, **kwargs)

            one, assignment = get_assignment(stream, skip=skip, **combined)

            if skip or combined.get('emit'):
                stream = assignment
            else:
                stream = assign(_input, assignment, one=one, **combined)

            if is_async:
                return_value(stream)
            else:
                for s in stream:
                    yield s
Ejemplo n.º 7
0
Archivo: regex.py Proyecto: nerevu/riko
 def async_reducer(item, rules):
     """Asynchronously apply `rules` to one field of `item`.

     The field name is taken from the first rule; `kwargs` and `multi` come
     from the enclosing scope. The updated item is handed to `return_value`.
     """
     field = rules[0]['field']
     word = item.get(field, **kwargs)
     grouped = group_by(rules, 'flags')

     if multi:
         group_rules, reducer = [g[1] for g in grouped], multi_substitute
     else:
         group_rules, reducer = rules, substitute

     replacement = yield ait.coop_reduce(reducer, group_rules, word)
     return_value(DotDict(merge([item, {field: replacement}])))
Ejemplo n.º 8
0
 def async_reducer(item, rules):
     """Rewrite one field of `item` via `ait.coop_reduce` over `rules`.

     The target field is named by `rules[0]`; `kwargs` and `multi` are read
     from the enclosing scope. The result is passed to `return_value`.
     """
     field = rules[0]['field']
     word = item.get(field, **kwargs)
     grouped = group_by(rules, 'flags')

     # defaults for the single-rule case, replaced below in multi mode
     group_rules = rules
     reducer = substitute

     if multi:
         group_rules = [group[1] for group in grouped]
         reducer = multi_substitute

     replacement = yield ait.coop_reduce(reducer, group_rules, word)
     updated = merge([item, {field: replacement}])
     return_value(DotDict(updated))
Ejemplo n.º 9
0
        def wrapper(item=None, **kwargs):
            """Apply the wrapped pipe to a single item and emit the result.

            Args:
                item (dict): The entry to process (default: None, in which
                    case an empty dict is used).
                kwargs (dict): Keyword arguments. They are merged last over
                    the decorator's defaults and options, and are updated in
                    place with `conf`, `assign`, `skip` and `stream` before
                    being passed to the pipe.

            Yields:
                In sync mode, each element of the resulting stream. In async
                mode, the result of calling the pipe is yielded and the
                resulting stream is handed to `return_value`.
            """
            module_name = wrapper.__module__.split('.')[-1]

            defaults = {
                'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
                'objectify': True}

            combined = merge([self.defaults, defaults, self.opts, kwargs])
            is_source = combined['ftype'] == 'none'
            def_assign = 'content' if is_source else module_name
            extracted = 'extract' in combined
            pdictize = combined.get('listize') if extracted else True

            combined.setdefault('assign', def_assign)
            combined.setdefault('emit', is_source)
            combined.setdefault('pdictize', pdictize)

            # the conf holds only the keys declared in the decorator defaults,
            # overridden by any explicitly passed `conf`
            conf = {k: combined[k] for k in self.defaults}
            conf.update(kwargs.get('conf', {}))
            combined.update({'conf': conf})

            uconf = DotDict(conf) if combined.get('dictize') else conf
            updates = {'conf': uconf, 'assign': combined.get('assign')}
            kwargs.update(updates)

            item = item or {}
            _input = DotDict(item) if combined.get('dictize') else item
            bfuncs = get_broadcast_funcs(**combined)
            skip = get_skip(_input, **combined)
            types = set() if skip else {combined['ftype'], combined['ptype']}

            # dispatch funcs are only needed for types other than pass/none
            if types.difference({'pass', 'none'}):
                dfuncs = get_dispatch_funcs(**combined)
            else:
                dfuncs = None

            parsed, orig_item = _dispatch(_input, bfuncs, dfuncs=dfuncs)
            kwargs.update({'skip': skip, 'stream': orig_item})

            # `async` is a reserved keyword as of Python 3.7, so `self.async`
            # is a SyntaxError there; look the attribute up by name instead
            is_async = getattr(self, 'async')

            if is_async:
                stream = yield pipe(*parsed, **kwargs)
            else:
                stream = pipe(*parsed, **kwargs)

            one, assignment = get_assignment(stream, skip=skip, **combined)

            if skip or combined.get('emit'):
                stream = assignment
            else:
                stream = assign(_input, assignment, one=one, **combined)

            if is_async:
                return_value(stream)
            else:
                for s in stream:
                    yield s
Ejemplo n.º 10
0
def memoize(*args, **kwargs):
    """Build a `Cache` from the given options and delegate to its `memoize`.

    Args:
        args: Positional arguments forwarded to `Cache.memoize`.
        kwargs: Keyword arguments. The keys listed below are removed first;
            whatever is left is forwarded to `Cache.memoize`.

    Kwargs:
        cache_type (str): Key into `CACHE_CONFIGS`, or 'auto' to use the
            value of `get_cache_type()` (default: 'simple').
        namespace: Passed to `Cache` (default: `DEF_NS`).
        CACHE_TIMEOUT: Overrides the config entry of the same name.
        CACHE_THRESHOLD: Overrides the config entry of the same name.

    Returns:
        The value returned by `Cache.memoize`.
    """
    requested = kwargs.pop('cache_type', 'simple')
    namespace = kwargs.pop('namespace', DEF_NS)
    cache_type = requested if requested != 'auto' else get_cache_type()
    config = merge([MEMOIZE_DEFAULTS, CACHE_CONFIGS[cache_type]])

    # explicit overrides are moved from kwargs into the cache config
    for option in ('CACHE_TIMEOUT', 'CACHE_THRESHOLD'):
        if option in kwargs:
            config[option] = kwargs.pop(option)

    cache = Cache(namespace=namespace, **config)
    return cache.memoize(*args, **kwargs)
Ejemplo n.º 11
0
Archivo: csv.py Proyecto: nerevu/riko
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {
        ...         'url': url, 'sanitize': True, 'skip_rows': 0,
        ...         'encoding': ENCODING}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if not skip:
        url = get_abspath(objconf.url)
        r = yield io.async_url_open(url)

        # `read_csv` is given these two settings under different names
        renamed = {
            'first_row': objconf.skip_rows,
            'custom_header': objconf.col_names}

        rkwargs = merge([objconf, renamed])
        stream = auto_close(read_csv(r, **rkwargs), r)
    else:
        stream = kwargs['stream']

    return_value(stream)
Ejemplo n.º 12
0
Archivo: csv.py Proyecto: sottom/riko
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {
        ...         'url': url, 'sanitize': True, 'skip_rows': 0,
        ...         'encoding': ENCODING}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        result = kwargs['stream']
    else:
        location = get_abspath(objconf.url)
        response = yield io.async_url_open(location)

        # map the pipe's option names onto the names `read_csv` is called with
        overrides = {}
        overrides['first_row'] = objconf.skip_rows
        overrides['custom_header'] = objconf.col_names

        reader_kwargs = merge([objconf, overrides])
        result = auto_close(read_csv(response, **reader_kwargs), response)

    return_value(result)
Ejemplo n.º 13
0
def parse_conf(item, **kwargs):
    """Resolve the `conf` keyword argument against `item`.

    Args:
        item: The entry the conf values are resolved against.
        kwargs (dict): Keyword arguments; `conf`, `defaults` and `objectify`
            are read here and everything is forwarded to `get_value`.

    Returns:
        The result of `get_value` when `conf` has no `keys` attribute or is
        a single-entry mapping whose key is 'subkey', 'value' or 'terminal'.
        Otherwise, the per-key results merged over `defaults`, wrapped in an
        `Objectify` when `objectify` is truthy.
    """
    kw = Objectify(kwargs, defaults={}, conf={})
    # TODO: fix so .items() returns a DotDict instance
    # parsed = {k: get_value(item, v) for k, v in kw.conf.items()}
    sentinel = {'subkey', 'value', 'terminal'}
    is_mapping = hasattr(kw.conf, 'keys')

    if is_mapping and not (
            len(kw.conf) == 1 and sentinel.intersection(kw.conf)):
        no_conf = remove_keys(kwargs, 'conf')
        parsed = {
            key: get_value(item, kw.conf[key], **no_conf) for key in kw.conf}

        result = merge([kw.defaults, parsed])
        return Objectify(result) if kw.objectify else result

    return get_value(item, **kwargs)
Ejemplo n.º 14
0
def parse_conf(item, **kwargs):
    """Parse the `conf` keyword argument with respect to `item`.

    Args:
        item: The entry the conf values are resolved against.
        kwargs (dict): Keyword arguments; `conf`, `defaults` and `objectify`
            are read here and everything is forwarded to `get_value`.

    Returns:
        Either the direct `get_value` result (conf lacks a `keys` attribute,
        or has exactly one entry keyed 'subkey', 'value' or 'terminal'), or
        the per-key results merged over `defaults` and optionally wrapped in
        an `Objectify`.
    """
    kw = Objectify(kwargs, defaults={}, conf={})
    # TODO: fix so .items() returns a DotDict instance
    # parsed = {k: get_value(item, v) for k, v in kw.conf.items()}
    sentinel = {'subkey', 'value', 'terminal'}

    if not hasattr(kw.conf, 'keys'):
        direct = True
    else:
        direct = len(kw.conf) == 1 and sentinel.intersection(kw.conf)

    if direct:
        return get_value(item, **kwargs)

    no_conf = remove_keys(kwargs, 'conf')
    parsed = {k: get_value(item, kw.conf[k], **no_conf) for k in kw.conf}
    result = merge([kw.defaults, parsed])

    if kw.objectify:
        return Objectify(result)

    return result
Ejemplo n.º 15
0
def assign(item, assignment, key, one=False):
    """Yield `item` merged with the assignment stored under `key`.

    Args:
        item (dict): The entry to extend.
        assignment (Iter): Iterator of values to assign.
        key (str): The key the value is stored under.
        one (bool): Store only the next value of `assignment` instead of a
            list of all its values (default: False).

    Yields:
        DotDict: The merged entry.
    """
    if one:
        value = next(assignment)
    else:
        value = list(assignment)

    combined = merge([item, {key: value}])
    yield DotDict(combined)
Ejemplo n.º 16
0
    def test_merge(self):
        # Checks `pr.merge` with its default arguments and then with several
        # `pred` / `op` / `default` combinations.
        merged = pr.merge([{'a': 1, 'b': 2}, {'b': 10, 'c': 11}])
        nt.assert_equal({'a': 1, 'b': 10, 'c': 11}, merged)

        # setup
        pair = [{'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': 5, 'd': 6}]

        # Combine all keys
        merged = pr.merge(pair, pred=bool, op=sum)
        nt.assert_equal({u'a': 1, u'c': 8, u'b': 6, u'd': 6}, merged)

        # Keep the first value that is not None
        def first(values):
            return next(filter(partial(is_not, None), values))

        options = {'pred': bool, 'op': first, 'default': None}
        merged = pr.merge(pair, **options)
        nt.assert_equal({u'a': 1, u'b': 2, u'c': 3, u'd': 6}, merged)

        # This will only reliably give the expected result for 2 records
        options = {'pred': bool, 'op': stats.mean, 'default': None}
        merged = pr.merge(pair, **options)
        nt.assert_equal({u'a': 1, u'b': 3.0, u'c': 4.0, u'd': 6.0}, merged)

        # Only combine key 'b'
        merged = pr.merge(pair, pred='b', op=sum)
        nt.assert_equal({u'a': 1, u'b': 6, u'c': 5, u'd': 6}, merged)

        # Only combine keys that have the same value of 'b'
        merged = pr.merge(pair, pred=itemgetter('b'), op=sum)
        nt.assert_equal({u'a': 1, u'b': 6, u'c': 5, u'd': 6}, merged)

        # This will reliably work for any number of records
        rows = [
            {'a': 1, 'b': 4, 'c': 0},
            {'a': 2, 'b': 5, 'c': 2},
            {'a': 3, 'b': 6, 'd': 7}]

        # number of rows each key appears in
        tally = defaultdict(int)

        for row in rows:
            for key in row:
                tally[key] += 1

        nt.assert_equal({u'a': 3, u'b': 3, u'c': 2, u'd': 1}, tally)

        totals = pr.merge(rows, pred=bool, op=sum)
        nt.assert_equal({u'a': 6, u'b': 15, u'c': 2, u'd': 7}, totals)

        averages = {u'a': 2.0, u'b': 5.0, u'c': 1.0, u'd': 7.0}
        options = {'pred': bool, 'op': ft.fpartial(truediv)}
        nt.assert_equal(averages, pr.merge([totals, tally], **options))

        # This should also reliably work for any number of records
        options = {
            'pred': bool,
            'op': ft.fpartial(ft.sum_and_count),
            'default': None}

        pairs = pr.merge(rows, **options)
        quotients = {k: truediv(*v) for k, v in pairs.items()}
        nt.assert_equal(averages, quotients)
Ejemplo n.º 17
0
def assign(item, assignment, **kwargs):
    """Yield `item` merged with the assignment stored under the assign key.

    Args:
        item (dict): The entry to extend.
        assignment (Iter): Iterator of values to assign.
        kwargs (dict): Keyword arguments.

    Kwargs:
        assign (str): The key the value is stored under.
        one (bool): Store only the next value of `assignment` instead of a
            list of all its values.
        dictize (bool): Wrap the merged entry in a `DotDict`.

    Yields:
        dict: The merged entry.
    """
    key = kwargs.get('assign')

    if kwargs.get('one'):
        value = next(assignment)
    else:
        value = list(assignment)

    merged = merge([item, {key: value}])

    if kwargs.get('dictize'):
        merged = DotDict(merged)

    yield merged
Ejemplo n.º 18
0
    def test_merge(self):
        # Checks `pr.merge` with its default arguments and then with several
        # `pred` / `op` / `default` combinations.
        merged = pr.merge([{"a": 1, "b": 2}, {"b": 10, "c": 11}])
        nt.assert_equal({"a": 1, "b": 10, "c": 11}, merged)

        # setup
        pair = [{"a": 1, "b": 2, "c": 3}, {"b": 4, "c": 5, "d": 6}]

        # Combine all keys
        merged = pr.merge(pair, pred=bool, op=sum)
        nt.assert_equal({u"a": 1, u"c": 8, u"b": 6, u"d": 6}, merged)

        # Keep the first value that is not None
        def first(values):
            return next(filter(partial(is_not, None), values))

        options = {"pred": bool, "op": first, "default": None}
        merged = pr.merge(pair, **options)
        nt.assert_equal({u"a": 1, u"b": 2, u"c": 3, u"d": 6}, merged)

        # This will only reliably give the expected result for 2 records
        options = {"pred": bool, "op": stats.mean, "default": None}
        merged = pr.merge(pair, **options)
        nt.assert_equal({u"a": 1, u"b": 3.0, u"c": 4.0, u"d": 6.0}, merged)

        # Only combine key 'b'
        merged = pr.merge(pair, pred="b", op=sum)
        nt.assert_equal({u"a": 1, u"b": 6, u"c": 5, u"d": 6}, merged)

        # Only combine keys that have the same value of 'b'
        merged = pr.merge(pair, pred=itemgetter("b"), op=sum)
        nt.assert_equal({u"a": 1, u"b": 6, u"c": 5, u"d": 6}, merged)

        # This will reliably work for any number of records
        rows = [
            {"a": 1, "b": 4, "c": 0},
            {"a": 2, "b": 5, "c": 2},
            {"a": 3, "b": 6, "d": 7},
        ]

        # number of rows each key appears in
        tally = defaultdict(int)

        for row in rows:
            for key in row:
                tally[key] += 1

        nt.assert_equal({u"a": 3, u"b": 3, u"c": 2, u"d": 1}, tally)

        totals = pr.merge(rows, pred=bool, op=sum)
        nt.assert_equal({u"a": 6, u"b": 15, u"c": 2, u"d": 7}, totals)

        averages = {u"a": 2.0, u"b": 5.0, u"c": 1.0, u"d": 7.0}
        options = {"pred": bool, "op": ft.fpartial(truediv)}
        nt.assert_equal(averages, pr.merge([totals, tally], **options))

        # This should also reliably work for any number of records
        options = {
            "pred": bool,
            "op": ft.fpartial(ft.sum_and_count),
            "default": None,
        }

        pairs = pr.merge(rows, **options)
        quotients = {k: truediv(*v) for k, v in pairs.items()}
        nt.assert_equal(averages, quotients)
Ejemplo n.º 19
0
def getpipe(args, pipe=SyncPipe):
    """Build a pipe for a source and return its `list` attribute.

    Args:
        args (tuple): A `(source, sleep)` pair. `source` is a mapping whose
            optional 'type' entry names the pipe (default: 'fetch').
        pipe: The pipe class to instantiate (default: SyncPipe).

    Returns:
        The `list` attribute of the created pipe.
    """
    source, sleep = args

    # entries of `source` are merged over the `sleep` setting
    return pipe(
        source.get('type', 'fetch'),
        conf=merge([{'sleep': sleep}, source])).list
Ejemplo n.º 20
0
        def wrapper(items=None, **kwargs):
            """Apply the wrapped pipe to a stream of items and emit the result.

            Also records `name` and `sub_type` on the wrapper itself.

            Args:
                items (Iter[dict]): The stream to process (default: None, in
                    which case an empty iterator is used).
                kwargs (dict): Keyword arguments. They are merged last over
                    the decorator's defaults and options, and are updated in
                    place with `conf` and `assign` before being passed to the
                    pipe.

            Yields:
                In sync mode, each element of the resulting stream. In async
                mode, the result of calling the pipe is yielded and the
                resulting stream is handed to `return_value`.
            """
            module_name = wrapper.__module__.split('.')[-1]
            wrapper.__dict__['name'] = module_name

            defaults = {
                'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
                'objectify': True, 'emit': True, 'assign': module_name}

            combined = merge([self.defaults, defaults, self.opts, kwargs])
            extracted = 'extract' in combined
            pdictize = combined.get('listize') if extracted else True

            combined.setdefault('pdictize', pdictize)

            # the conf holds only the keys declared in the decorator defaults,
            # overridden by any explicitly passed `conf`
            conf = {k: combined[k] for k in self.defaults}
            conf.update(kwargs.get('conf', {}))
            combined.update({'conf': conf})

            uconf = DotDict(conf) if combined.get('dictize') else conf
            updates = {'conf': uconf, 'assign': combined.get('assign')}
            kwargs.update(updates)

            items = items or iter([])
            _INPUT = map(DotDict, items) if combined.get('dictize') else items
            bfuncs = get_broadcast_funcs(**combined)
            types = {combined['ftype'], combined['ptype']}

            # dispatch funcs are only needed for types other than pass/none
            if types.difference({'pass', 'none'}):
                dfuncs = get_dispatch_funcs(**combined)
            else:
                dfuncs = None

            pairs = (_dispatch(item, bfuncs, dfuncs=dfuncs) for item in _INPUT)
            parsed, _ = _dispatch(DotDict(), bfuncs, dfuncs=dfuncs)

            # - operators can't skip items
            # - purposely setting both variables to maps of the same iterable
            #   since only one is intended to be used at any given time
            # - `tuples` is an iterator of tuples of the first two `parsed`
            #   elements
            tuples = ((p[0][0], p[0][1]) for p in pairs)
            orig_stream = (p[0][0] for p in pairs)
            objconf = parsed[1]

            # `async` is a reserved keyword as of Python 3.7, so `self.async`
            # is a SyntaxError there; look the attribute up by name instead
            is_async = getattr(self, 'async')

            if is_async:
                stream = yield pipe(orig_stream, objconf, tuples, **kwargs)
            else:
                stream = pipe(orig_stream, objconf, tuples, **kwargs)

            sub_type = 'aggregator' if hasattr(stream, 'keys') else 'composer'
            wrapper.__dict__['sub_type'] = sub_type

            # operators can only assign one value per item and can't skip items
            _, assignment = get_assignment(stream, **combined)

            if combined.get('emit'):
                stream = assignment
            else:
                singles = (iter([v]) for v in assignment)
                assigned = (
                    assign({}, s, one=True, **combined) for s in singles)

                stream = multiplex(assigned)

            if is_async:
                return_value(stream)
            else:
                for s in stream:
                    yield s
Ejemplo n.º 21
0
def assign(item, assignment, **kwargs):
    """Merge an assignment into `item` and yield the result.

    Args:
        item (dict): The entry to extend.
        assignment (Iter): Iterator of values to assign.
        kwargs (dict): Keyword arguments.

    Kwargs:
        assign (str): The key the value is stored under.
        one (bool): Store only the next value of `assignment` instead of a
            list of all its values.
        dictize (bool): Wrap the merged entry in a `DotDict`.

    Yields:
        dict: The merged entry.
    """
    key = kwargs.get('assign')
    take_one = kwargs.get('one')
    value = list(assignment) if not take_one else next(assignment)
    merged = merge([item, {key: value}])

    if not kwargs.get('dictize'):
        yield merged
    else:
        yield DotDict(merged)
Ejemplo n.º 22
0
def getpipe(args, pipe=SyncPipe):
    """Build a pipe for a source and return its `output` attribute.

    Args:
        args (tuple): A `(source, conf)` pair. `source` is a mapping whose
            optional 'type' entry names the pipe (default: 'fetch').
        pipe: The pipe class to instantiate (default: SyncPipe).

    Returns:
        The `output` attribute of the created pipe.
    """
    source, conf = args
    ptype = source.get('type', 'fetch')

    # entries of `source` are merged over the shared `conf`
    combined = merge([conf, source])
    return pipe(ptype, conf=combined).output
Ejemplo n.º 23
0
        def wrapper(items=None, **kwargs):
            """Apply the wrapped pipe to a stream of items and emit the result.

            Also records `name` and `sub_type` on the wrapper itself.

            Args:
                items (Iter[dict]): The stream to process (default: None, in
                    which case an empty iterator is used).
                kwargs (dict): Keyword arguments. They are merged last over
                    the decorator's defaults and options, and are updated in
                    place with `conf` and `assign` before being passed to the
                    pipe.

            Yields:
                In sync mode, each element of the resulting stream. In async
                mode, the result of calling the pipe is yielded and the
                resulting stream is handed to `return_value`.
            """
            module_name = wrapper.__module__.split('.')[-1]
            wrapper.__dict__['name'] = module_name

            defaults = {
                'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
                'objectify': True, 'emit': True, 'assign': module_name}

            combined = merge([self.defaults, defaults, self.opts, kwargs])
            extracted = 'extract' in combined
            pdictize = combined.get('listize') if extracted else True

            combined.setdefault('pdictize', pdictize)

            # the conf holds only the keys declared in the decorator defaults,
            # overridden by any explicitly passed `conf`
            conf = {k: combined[k] for k in self.defaults}
            conf.update(kwargs.get('conf', {}))
            combined.update({'conf': conf})

            # replace conf with dictized version so we can access its
            # attributes even if we already extracted a value
            updates = {'conf': DotDict(conf), 'assign': combined.get('assign')}
            kwargs.update(updates)

            items = items or iter([])
            _INPUT = map(DotDict, items) if combined.get('dictize') else items
            bfuncs = get_broadcast_funcs(**combined)
            types = {combined['ftype'], combined['ptype']}

            # dispatch funcs are only needed for types other than pass/none
            if types.difference({'pass', 'none'}):
                dfuncs = get_dispatch_funcs(**combined)
            else:
                dfuncs = None

            pairs = (_dispatch(item, bfuncs, dfuncs=dfuncs) for item in _INPUT)
            parsed, _ = _dispatch(DotDict(), bfuncs, dfuncs=dfuncs)

            # - operators can't skip items
            # - purposely setting both variables to maps of the same iterable
            #   since only one is intended to be used at any given time
            # - `tuples` is an iterator of tuples of the first two `parsed`
            #   elements
            tuples = ((p[0][0], p[0][1]) for p in pairs)
            orig_stream = (p[0][0] for p in pairs)
            objconf = parsed[1]

            # `async` is a reserved keyword as of Python 3.7, so `self.async`
            # is a SyntaxError there; look the attribute up by name instead
            is_async = getattr(self, 'async')

            if is_async:
                stream = yield pipe(orig_stream, objconf, tuples, **kwargs)
            else:
                stream = pipe(orig_stream, objconf, tuples, **kwargs)

            sub_type = 'aggregator' if hasattr(stream, 'keys') else 'composer'
            wrapper.__dict__['sub_type'] = sub_type

            # operators can only assign one value per item and can't skip items
            _, assignment = get_assignment(stream, **combined)

            if combined.get('emit'):
                stream = assignment
            else:
                singles = (iter([v]) for v in assignment)
                key = combined.get('assign')
                assigned = (assign({}, s, key, one=True) for s in singles)
                stream = multiplex(assigned)

            if is_async:
                return_value(stream)
            else:
                for s in stream:
                    yield s
Ejemplo n.º 24
0
def parser(stream, objconf, tuples, **kwargs):
    """ Parses the pipe content

    Args:
        stream (Iter[dict]): The source. Note: this shares the `tuples`
            iterator, so consuming it will consume `tuples` as well.

        objconf (obj): The pipe configuration (an Objectify instance)

        tuples (Iter[(dict, obj)]): Iterable of tuples of (item, objconf)
            `item` is an element in the source stream and `objconf` is the item
            configuration (an Objectify instance). Note: this shares the
            `stream` iterator, so consuming it will consume `stream` as well.

        kwargs (dict): Keyword arguments.

    Kwargs:
        other (Iter[dict]): stream to join

    Returns:
        Iter(dict): The output stream

    Examples:
        >>> from itertools import repeat
        >>> from meza.fntools import Objectify
        >>>
        >>> stream = ({'x': 'foo', 'sum': x} for x in range(5))
        >>> other = ({'x': 'foo', 'count': x + 5} for x in range(5))
        >>> objconf = Objectify({})
        >>> tuples = zip(stream, repeat(objconf))
        >>> joined = parser(stream, objconf, tuples, other=other)
        >>> next(joined) == {'x': 'foo', 'sum': 0, 'count': 5}
        True
        >>> len(list(joined))
        24
        >>> objconf = Objectify({'join_key': 'x', 'other_join_key': 'y'})
        >>> stream = ({'x': 'foo-%s' % x, 'sum': x} for x in range(5))
        >>> other = ({'y': 'foo-%s' % x, 'count': x + 5} for x in range(5))
        >>> tuples = zip(stream, repeat(objconf))
        >>> joined = parser(stream, objconf, tuples, other=other)
        >>> next(joined) == {'count': 5, 'x': 'foo-0', 'sum': 0, 'y': 'foo-0'}
        True
        >>> len(list(joined))
        4
    """
    # without any join key, defer entirely to `join`
    if not (objconf.join_key or objconf.other_join_key):
        return join(stream, kwargs['other'])

    # each key falls back to the other one when it isn't configured
    x_key = objconf.join_key or objconf.other_join_key
    y_key = objconf.other_join_key or x_key

    def is_match(x, y):
        if not objconf.lower:
            return x.get(x_key) == y.get(y_key)

        x_value, y_value = x.get(x_key, ''), y.get(y_key, '')
        return x_value.lower() == y_value.lower()

    prod = product(stream, kwargs['other'])
    return (merge([x, y]) for x, y in prod if is_match(x, y))
Ejemplo n.º 25
0
    def test_merge(self):
        # Checks `pr.merge` with its default arguments and then with several
        # `pred` / `op` / `default` combinations.
        merged = pr.merge([{'a': 1, 'b': 2}, {'b': 10, 'c': 11}])
        nt.assert_equal({'a': 1, 'b': 10, 'c': 11}, merged)

        # setup
        pair = [{'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': 5, 'd': 6}]

        # Combine all keys
        merged = pr.merge(pair, pred=bool, op=sum)
        nt.assert_equal({u'a': 1, u'c': 8, u'b': 6, u'd': 6}, merged)

        # Keep the first value that is not None
        def first(values):
            return next(filter(partial(is_not, None), values))

        options = {'pred': bool, 'op': first, 'default': None}
        merged = pr.merge(pair, **options)
        nt.assert_equal({u'a': 1, u'b': 2, u'c': 3, u'd': 6}, merged)

        # This will only reliably give the expected result for 2 records
        options = {'pred': bool, 'op': stats.mean, 'default': None}
        merged = pr.merge(pair, **options)
        nt.assert_equal({u'a': 1, u'b': 3.0, u'c': 4.0, u'd': 6.0}, merged)

        # Only combine key 'b'
        merged = pr.merge(pair, pred='b', op=sum)
        nt.assert_equal({u'a': 1, u'b': 6, u'c': 5, u'd': 6}, merged)

        # Only combine keys that have the same value of 'b'
        merged = pr.merge(pair, pred=itemgetter('b'), op=sum)
        nt.assert_equal({u'a': 1, u'b': 6, u'c': 5, u'd': 6}, merged)

        # This will reliably work for any number of records
        rows = [
            {'a': 1, 'b': 4, 'c': 0},
            {'a': 2, 'b': 5, 'c': 2},
            {'a': 3, 'b': 6, 'd': 7}]

        # number of rows each key appears in
        tally = defaultdict(int)

        for row in rows:
            for key in row:
                tally[key] += 1

        nt.assert_equal({u'a': 3, u'b': 3, u'c': 2, u'd': 1}, tally)

        totals = pr.merge(rows, pred=bool, op=sum)
        nt.assert_equal({u'a': 6, u'b': 15, u'c': 2, u'd': 7}, totals)

        averages = {u'a': 2.0, u'b': 5.0, u'c': 1.0, u'd': 7.0}
        options = {'pred': bool, 'op': ft.fpartial(truediv)}
        nt.assert_equal(averages, pr.merge([totals, tally], **options))

        # This should also reliably work for any number of records
        options = {
            'pred': bool,
            'op': ft.fpartial(ft.sum_and_count),
            'default': None}

        pairs = pr.merge(rows, **options)
        quotients = {k: truediv(*v) for k, v in pairs.items()}
        nt.assert_equal(averages, quotients)
Ejemplo n.º 26
0
def reducer(item, rule):
    """Apply a single rule to `item` and return the result as a DotDict.

    Args:
        item (dict): The entry to transform.
        rule (obj): Object with `field`, `newval` and `copy` attributes.

    Returns:
        DotDict: `item` (with `rule.field` removed unless `rule.copy` is
        truthy) merged with the field's value stored under `rule.newval`,
        when `rule.newval` is truthy.
    """
    if rule.newval:
        additions = {rule.newval: item.get(rule.field)}
    else:
        additions = {}

    if rule.copy:
        base = item
    else:
        base = remove_keys(item, rule.field)

    return DotDict(merge([base, additions]))
Ejemplo n.º 27
0
def reducer(item, rule):
    """Return `item` transformed by one rule.

    Args:
        item (dict): The entry to transform.
        rule (obj): Object with `field`, `newval` and `copy` attributes.

    Returns:
        DotDict: The merge of the retained entry and the new key, if any.
    """
    # the field's value is stored under `newval` only when one is given
    new_dict = {}

    if rule.newval:
        new_dict[rule.newval] = item.get(rule.field)

    # the original field is retained only when the rule asks for a copy
    old_dict = remove_keys(item, rule.field) if not rule.copy else item
    merged = merge([old_dict, new_dict])
    return DotDict(merged)