Example #1
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that searches for and returns feed links found in a page.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
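
A minimal driver for this source, sketched under two assumptions: the one-item list stands in for pipe2py's pipeforever pipe (the loop above only checks item.get('forever')), and the conf entry uses the wired-value dict shape ({'value': ...}) that utils.get_value is expected to unpack.

forever = [{'forever': True}]                     # stand-in for the pipeforever pipe
conf = {'URL': {'value': 'http://example.com/'}}  # page to scan for feed links

for item in pipe_feedautodiscovery(context=None, _INPUT=forever, conf=conf):
    print item['link']                            # one dict per discovered feed URL
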
Example #2
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses the first feed found on one or more
    sites. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for link in autorss.getRSSLink(url.encode('utf-8')):
                parsed = speedparser.parse(urlopen(link).read())

                for entry in utils.gen_entries(parsed):
                    yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
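
The _INPUT these sources expect is a "forever" pipe: an iterable whose items make item.get('forever') truthy, so the outer loop runs exactly once per configured URL. pipe2py ships its own pipeforever; the generator below is only an illustrative stand-in.

def pipe_forever():
    # endlessly yield a marker item; sources break out after one pass
    while True:
        yield {'forever': True}

entries = pipe_fetchsitefeed(_INPUT=pipe_forever(),
                             conf={'URL': {'value': 'http://example.com/'}})
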
Example #3
def parse_conf(conf):
    url = utils.get_abspath(conf.URL)
    path = conf.path.split('.') if conf.path else []
    return (url, path, None)
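
parse_conf reads conf by attribute access, so any object exposing URL and path attributes will do; the namedtuple below is only an illustrative stand-in. The dotted path string is split into a list of keys.

from collections import namedtuple

Conf = namedtuple('Conf', ['URL', 'path'])
url, path, _ = parse_conf(Conf(URL='data/quote.json', path='rates.USD'))
# url  -> absolute form of 'data/quote.json' (whatever utils.get_abspath returns)
# path -> ['rates', 'USD']
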
Example #4
# Restored here as an assumption: the yield/returnValue style below requires
# Twisted's @inlineCallbacks decorator (twisted.internet.defer).
@inlineCallbacks
def asyncGetDefaultRateData(err=False):
    path = join('..', 'data', 'quote.json')
    url = utils.get_abspath(path)
    resp = yield deferToThread(urlopen, url)
    json = loads(resp.read())
    returnValue(json)
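
This helper is written in Twisted's inlineCallbacks style: deferToThread runs the blocking urlopen off the reactor thread and returnValue hands back the parsed JSON. Assuming the decorator noted above, the caller receives a Deferred; a hedged consumption sketch:

def show(data):
    print data                        # parsed contents of quote.json

d = asyncGetDefaultRateData()
d.addCallback(show)
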
Example #5
def parse_conf(conf):
    url = utils.get_abspath(conf.URL)
    path = conf.path.split('.') if conf.path else []
    return (url, path, None)
Example #6
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf['URL'], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf['skip'], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
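
A configuration sketch for this CSV source. The values mirror the keys documented in the docstring; the wired-value dict shape and the one-item forever input are the same assumptions as in the earlier sketches.

conf = {
    'URL': {'value': 'http://example.com/data.csv'},
    'separator': {'value': ','},
    'skip': {'value': '0'},            # no extra header rows to skip
    'col_mode': {'value': 'row'},      # take column names from the header row(s)
    'col_name': [],                    # only used when col_mode == 'custom'
    'col_row_start': {'value': '1'},
    'col_row_end': {'value': '1'},
}

for row in pipe_csv(_INPUT=[{'forever': True}], conf=conf):
    print row                          # dict keyed by the header fieldnames
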
Example #7
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given web site as a string.
    Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields

    conf : dict
       URL -- url object containing the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

       TODOS:
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using a HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but it seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found count items:", len(items)

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
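
A configuration sketch for FetchPage using the from/to/token keys named in the docstring; the exact value shapes are assumptions, as is the behaviour of _parse_content, which is not shown here.

conf = {
    'URL': {'value': 'http://example.com/'},
    'from': {'value': '<title>'},      # start of the slice to keep
    'to': {'value': '</title>'},       # end of the slice to keep
    'token': {'value': ''},            # empty: yield the whole slice as one item
}

for piece in pipe_fetchpage(_INPUT=[{'forever': True}], conf=conf):
    print piece['content']
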
Example #8
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf["separator"]
    conf_mode = conf["col_mode"]
    col_name = conf["col_name"]

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf["URL"], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf["skip"], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == "custom":
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get("forever"):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #9
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but it seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            # NOTE: f was already consumed by f.read() above, so parse the
            # downloaded content rather than the exhausted file object
            # (requires `from StringIO import StringIO` among the imports)
            source = StringIO(content.encode('utf-8'))
            tree = html5parser.parse(source) if html5 else html.parse(source)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
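
A configuration sketch for XPathFetchPage; the xpath/html5/useAsString keys come from the docstring, while the wired-value shape and the forever stand-in remain assumptions.

conf = {
    'URL': {'value': 'http://example.com/'},
    'xpath': {'value': '//a'},          # extract every anchor element
    'html5': {'value': 'false'},        # use lxml.html rather than the html5 parser
    'useAsString': {'value': 'false'},  # yield dicts built from the matched nodes
}

for node in pipe_xpathfetchpage(_INPUT=[{'forever': True}], conf=conf):
    print node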