Ejemplo n.º 1
0
def pipe_sort(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that sorts the input source according to the specified key.
    Not loopable. Not lazy.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf : {
        'KEY': [
            {
                'field': {'type': 'text', 'value': 'title'},
                'dir': {'type': 'text', 'value': 'DESC'}
            }
        ]
    }

    Returns
    -------
    _OUTPUT : generator of sorted items
    """
    test = kwargs.pop('pass_if', None)
    _pass = utils.get_pass(test=test)
    key_defs = imap(DotDict, utils.listize(conf['KEY']))
    get_value = partial(utils.get_value, **kwargs)
    parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    keys = imap(parse_conf, key_defs)
    order = ('%s%s' % ('-' if k.dir == 'DESC' else '', k.field) for k in keys)
    comparers = map(get_comparer, order)
    cmp_func = partial(multikeysort, comparers=comparers)
    _OUTPUT = _INPUT if _pass else iter(sorted(_INPUT, cmp=cmp_func))
    return _OUTPUT
Ejemplo n.º 2
0
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that searches for and returns feed links found in a page.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 3
0
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses the first feed found on one or more
    sites. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for link in autorss.getRSSLink(url.encode('utf-8')):
                parsed = speedparser.parse(urlopen(link).read())

                for entry in utils.gen_entries(parsed):
                    yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 4
0
def _get_broadcast_funcs(pieces, ftype='with', **kwargs):
    test = kwargs.pop('pass_if', None)
    listize = kwargs.pop('listize', True)
    parse = kwargs.pop('parse', True)
    pdictize = kwargs.pop('pdictize', True)
    cust_func = kwargs.pop('cust_func', False)
    get_value = partial(utils.get_value, **kwargs)
    get_pass = partial(utils.get_pass, test=test)
    get_with = partial(utils.get_with, **kwargs)

    if parse:
        get_func = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    else:
        get_func = get_value

    if listize:
        listed = utils.listize(pieces)
        piece_defs = map(DotDict, listed) if pdictize else listed
        get_pieces = lambda item: imap(get_func, piece_defs, repeat(item))
    else:
        piece_defs = DotDict(pieces) if pdictize else pieces
        get_pieces = partial(get_func, piece_defs)

    return (get_pieces, get_with, get_pass, cust_func)
Ejemplo n.º 5
0
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that filters for source items matching the given rules.
    Not loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {
        'MODE': {'value': <'permit' or 'block'>},
        'COMBINE': {'value': <'and' or 'or'>}
        'RULE': [
            {
                'field': {'value': 'search field'},
                'op': {'value': 'one of SWITCH above'},
                'value': {'value': 'search term'}
            }
        ]
    }

    kwargs : other inputs, e.g., to feed terminals for rule values

    Returns
    -------
    _OUTPUT : generator of filtered items

    Examples
    --------
    >>> import os.path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata
    >>> parent = p.dirname(p.dirname(__file__))
    >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = 'file://%s' % file_name
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf)
    >>> mode = {'value': 'permit'}
    >>> combine = {'value': 'and'}
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'web'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> pipe_filter(_INPUT=input, conf=conf).next()['title']
    u'E-Commerce Website Developer | Elance Job'
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'kjhlked'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> list(pipe_filter(_INPUT=input, conf=conf))
    []
    """
    conf = DotDict(conf)
    test = kwargs.pop('pass_if', None)
    permit = conf.get('MODE', **kwargs) == 'permit'
    combine = conf.get('COMBINE', **kwargs)

    if not combine in {'and', 'or'}:
        raise Exception("Invalid combine: %s. (Expected 'and' or 'or')" %
                        combine)

    rule_defs = map(DotDict, utils.listize(conf['RULE']))
    get_pass = partial(utils.get_pass, test=test)
    get_value = partial(utils.get_value, **kwargs)
    parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i))
    funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough]

    inputs = imap(DotDict, _INPUT)
    splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass)
    outputs = starmap(partial(parse_rules, **kwargs), splits)
    parsed = utils.dispatch(outputs, *funcs)
    gathered = starmap(partial(parse_result, permit=permit), parsed)
    _OUTPUT = ifilter(None, gathered)
    return _OUTPUT
Ejemplo n.º 6
0
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that filters for source items matching the given rules.
    Not loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {
        'MODE': {'value': <'permit' or 'block'>},
        'COMBINE': {'value': <'and' or 'or'>}
        'RULE': [
            {
                'field': {'value': 'search field'},
                'op': {'value': 'one of SWITCH above'},
                'value': {'value': 'search term'}
            }
        ]
    }

    kwargs : other inputs, e.g., to feed terminals for rule values

    Returns
    -------
    _OUTPUT : generator of filtered items

    Examples
    --------
    >>> import os.path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata
    >>> parent = p.dirname(p.dirname(__file__))
    >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = 'file://%s' % file_name
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf)
    >>> mode = {'value': 'permit'}
    >>> combine = {'value': 'and'}
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'web'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> pipe_filter(_INPUT=input, conf=conf).next()['title']
    u'E-Commerce Website Developer | Elance Job'
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'kjhlked'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> list(pipe_filter(_INPUT=input, conf=conf))
    []
    """
    conf = DotDict(conf)
    test = kwargs.pop('pass_if', None)
    permit = conf.get('MODE', **kwargs) == 'permit'
    combine = conf.get('COMBINE', **kwargs)

    if not combine in {'and', 'or'}:
        raise Exception(
            "Invalid combine: %s. (Expected 'and' or 'or')" % combine)

    rule_defs = map(DotDict, utils.listize(conf['RULE']))
    get_pass = partial(utils.get_pass, test=test)
    get_value = partial(utils.get_value, **kwargs)
    parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i))
    funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough]

    inputs = imap(DotDict, _INPUT)
    splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass)
    outputs = starmap(partial(parse_rules, **kwargs), splits)
    parsed = utils.dispatch(outputs, *funcs)
    gathered = starmap(partial(parse_result, permit=permit), parsed)
    _OUTPUT = ifilter(None, gathered)
    return _OUTPUT
Ejemplo n.º 7
0
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given web site as a string.
    Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields

    conf : dict
       URL -- url object contain the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

       TODOS:
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using a HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trival task python?
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found count items:", len(items)

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 8
0
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
       URL -- url object contain the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trival task python?
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            tree = html5parser.parse(f) if html5 else html.parse(f)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break