Ejemplo n.º 1
0
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that searches for and returns feed links found in a page.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 2
0
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses the first feed found on one or more
    sites. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for link in autorss.getRSSLink(url.encode('utf-8')):
                parsed = speedparser.parse(urlopen(link).read())

                for entry in utils.gen_entries(parsed):
                    yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 3
0
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that issues YQL queries. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : yqlquery -- YQL query
        # todo: handle envURL

    Yields
    ------
    _OUTPUT : query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = utils.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert xml into generation of dicts
        for element in results.getchildren():
            yield utils.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 4
0
def pipe_datebuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """A date module that converts a text string into a datetime value. Useful
    as terminal data. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items
    conf : {'DATE': {'type': 'datetime', 'value': '12/2/2014'}}

    Yields
    ------
    _OUTPUT : date timetuples
    """
    conf = DotDict(conf)

    for item in _INPUT:
        _input = DotDict(item)
        text = utils.get_value(conf['DATE'], _input, **kwargs).lower()

        if text.endswith((' day', ' days')):
            # relative offset in days, e.g. "5 days"
            offset = int(text.split(' ')[0])
            result = dt.today() + timedelta(days=offset)
        elif text.endswith((' year', ' years')):
            # relative offset in years, e.g. "2 years"
            offset = int(text.split(' ')[0])
            result = dt.today().replace(year=dt.today().year + offset)
        else:
            # named dates like "today"/"tomorrow" come from SWITCH
            result = SWITCH.get(text)

        # fall back to free-form date parsing
        result = result or utils.get_date(text)

        if not result:
            raise Exception('Unrecognized date string: %s' % text)

        yield result.timetuple()
Ejemplo n.º 5
0
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf['URL'], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf['skip'], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 6
0
def _gen_fieldnames(conf, reader, item, **kwargs):
    """Yield one header row from `reader` for each row between
    col_row_start and col_row_end (inclusive)."""
    first = int(utils.get_value(conf['col_row_start'], item, **kwargs))
    last = int(utils.get_value(conf['col_row_end'], item, **kwargs))

    for _ in xrange(last - first + 1):
        yield reader.next()
Ejemplo n.º 7
0
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given web site as a string.
    Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields

    conf : dict
       URL -- url object contain the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

       TODOS:
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using a HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trival task python?
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found count items:", len(items)

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 8
0
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf["separator"]
    conf_mode = conf["col_mode"]
    col_name = conf["col_name"]

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf["URL"], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf["skip"], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == "custom":
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get("forever"):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Ejemplo n.º 9
0
def _gen_fieldnames(conf, reader, item, **kwargs):
    """Yield the header row(s) from `reader`: one row per position between
    col_row_start and col_row_end (inclusive)."""
    row_start = int(utils.get_value(conf["col_row_start"], item, **kwargs))
    row_end = int(utils.get_value(conf["col_row_end"], item, **kwargs))
    count = row_end - row_start + 1

    for _ in xrange(count):
        yield reader.next()
Ejemplo n.º 10
0
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
       URL -- url object contain the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trival task python?
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            tree = html5parser.parse(f) if html5 else html.parse(f)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break