コード例 #1
0
def test_149():
    """Exercise the feedparser module directly with unusable sources.

    Every source below must come back as a result with the bozo flag
    set.
    """
    # The empty string is the plain case. ``None`` covers a bug in the
    # first version of this patch, which raised a ``TypeError``
    # whenever an exception occurred during feed download (e.g. a
    # normal IOError, host not available); passing None simulates that
    # situation.
    for source in ("", None):
        result = feedparser.parse(source)
        assert result.bozo
コード例 #2
0
def test_native():
    """Test the problem with the feedparser module directly.

    Both documents are deliberately bozo: the final closing ">" char
    is missing. The href values extracted from them are expected to be
    exactly of type ``unicode``.
    """
    # An enclosure nested inside an item.
    enclosure_doc = (
        '<rss><channel><item><enclosure href="http://h.com/p/f.mp3" length="10" /></item></channel></rss')
    enclosure_href = feedparser.parse(enclosure_doc).entries[0].enclosures[0].href
    assert type(enclosure_href) == unicode

    # An itunes image attached to the channel itself.
    image_doc = (
        '<rss><channel><itunes:image href="http://image.example.com/a.gif" /></channel></rss')
    image_href = feedparser.parse(image_doc).feed.image.href
    assert type(image_href) == unicode
コード例 #3
0
ファイル: parse.py プロジェクト: miracle2k/feedplatform
def update_feed(feed, options=None):
    """Parse and update a single feed, as specified by the instance
    of the ``Feed`` model in ``feed``.

    This is the one, main, most important core function, at the
    epicenter of the package, providing different hooks to the rest
    of the world.

    ``options`` can contain any values, and addins may choose to act
    differently depending on what they find there. For example, this
    allows you to support a "full" and "light" mode, whereas
    performance heavy jobs like downloading a feed image are only
    processed when necessary in light mode, but will be forced in
    full mode. If ``options`` is omitted (or ``None``), an empty dict
    is used. A copy of the options is stored on ``feed._options``.

    The function always returns ``None``. It returns early, without
    committing, when the ``before_parse`` or ``after_parse`` hook asks
    to stop, or when resolving a guid raises
    ``db.MultipleObjectsReturned``.
    """

    # instead of adding an additional argument to every hook, pass
    # the options along via ``feed``. A copy is stored rather than the
    # caller's own object.
    feed._options = {} if options is None else options.copy()

    # HOOK: BEFORE_PARSE
    parser_args = {
        'agent': config.USER_AGENT,
        'handlers': list(config.URLLIB2_HANDLERS),
    }
    stop = hooks.trigger('before_parse', args=[feed, parser_args])
    if stop:
        log.info('Feed #%d skipped by addin' % (feed.id))
        return

    # ACTION: PARSE FEED
    log.info('Updating feed #%d: %s' % (feed.id, feed.url))
    # It may be worth noting that FeedParser already IDNA-encodes by
    # itself, but expects the path/query etc. to already be quoted,
    # or it'll screw up the url badly.
    data_dict = feedparser.parse(asciify_url(feed.url), **parser_args)

    # HOOK: AFTER_PARSE
    stop = hooks.trigger('after_parse', args=[feed, data_dict])
    if stop:
        log.info('Feed #%d: Futher processing skipped by addin' % (feed.id))
        return

    # The bozo feature of Universal Feed Parser allows it to parse
    # feeds that are not well-formed
    # (http://feedparser.org/docs/bozo.html).
    # While very useful in many cases, it also means that just about
    # anything, from 404 to parking pages will be represented as a
    # feed object with the bozo flag set (albeit without any useful
    # feed data obviously - for example, the whole page content will
    # be inside the ``subtitle`` field).
    #
    # How do we differentiate between "valid" feed problems like a
    # missing closing tag, that could potentially be ignored while
    # still extracting useful content, and completely invalid data?
    # Simple, we don't. This will be the job of the error handling
    # addin, and should not be our care right now. Suffice to say
    # though that it is important for the addin to make sure that
    # those completely invalid feeds are skipped early so that e.g.
    # a previously valid feed title in the database is not overridden
    # with empty or clearly erroneous data.
    #
    # We will log the problem, though.
    if data_dict.bozo:
        # TODO: add a hook here
        log.warn('Feed #%d bozo: %s' % (feed.id, data_dict.bozo_exception))

    # ACTION: HANDLE ITEMS
    for entry_dict in data_dict.entries:

        # HOOK: ITEM
        stop = hooks.trigger('item', args=[feed, data_dict, entry_dict])
        if stop:
            log.debug('Feed #%d: Item was skipped by addin' % (feed.id))
            continue

        # ACTION: DETERMINE GUID; HOOKS: GET_GUID, NEED_GUID
        #
        # Determine a unique id for the item; this is one of the
        # few fixed requirements that we have: we need a guid.
        # Addins can provide new ways to determine one, but if all
        # fails, we just can't handle the item.
        guid = hooks.trigger('get_guid', args=[feed, entry_dict])
        if not guid:
            guid = entry_dict.get('guid')
        if not guid:
            guid = hooks.trigger('need_guid', args=[feed, entry_dict])

        # HOOK: NO_GUID
        if not guid:
            hooks.trigger('no_guid', args=[feed, entry_dict])
            log.warn('Feed #%d: unable to determine item guid' % (feed.id))
            continue
        else:
            log.debug('Feed #%d: determined item guid "%s"' % (feed.id, guid))


        # ACTION: RESOLVE GUID TO ITEM; HOOKS: GET_ITEM, NEED_ITEM
        #
        # XXX: we need more extensive testing here, with all the variants
        # of returning items, return false etc. involved.
        #
        # Note how each hook result is passed through get_one, since a
        # possible issue when resolving a guid is that for whatever
        # reason the database may contain multiple matching rows. This
        # is an error, and we handle it here for both our default query
        # as well as results delivered via a hook (the latter means
        # that addins don't have to care about this situation
        # themselves).
        try:
            item = db.get_one(hooks.trigger('get_item',
                                            args=[feed, entry_dict, guid]))
            if item is None:
                # does the item already exist for *this feed*?
                item = db.get_one((db.store.find(db.models.Item,
                                                 db.models.Item.feed==feed,
                                                 db.models.Item.guid==guid)))
            if item is None:
                item = db.get_one(hooks.trigger('need_item',
                                                args=[feed, entry_dict, guid]))
        except db.MultipleObjectsReturned:
            # TODO: provide a hook
            # TODO: test for this case
            #
            # NOTE(review): this aborts the whole feed update, not just
            # the current entry, and skips the final commit below, so
            # items flushed earlier in this run are not committed by
            # this function. Confirm whether ``continue`` (skip only
            # this entry) would be the better behaviour.
            log.warn('Feed #%d: multiple items match guid "%s", '
                     'aborting update' % (feed.id, guid))
            return


        if not item:
            # HOOK: CREATE_ITEM
            item = hooks.trigger('create_item', args=[feed, entry_dict, guid])
            if not item:
                item = db.models.Item()
                item.feed = feed
                item.guid = guid

            db.store.add(item)
            # HOOK: NEW_ITEM
            #
            # Note how this happens before flushing(), so that any
            # changes made by this hook will go into the initial
            # INSERT query. If you need an existing primary key, use
            # the process_item hook instead.
            hooks.trigger('new_item', args=[feed, item, entry_dict])

            db.store.flush()
            log.info('Feed #%d: found new item (#%d)' % (feed.id, item.id))
            item_created = True
        else:
            # HOOK: FOUND_ITEM
            hooks.trigger('found_item', args=[feed, item, entry_dict])
            item_created = False

        # HOOK: PROCESS_ITEM
        hooks.trigger('process_item', args=[feed, item, entry_dict, item_created])

        # flush once for each item
        db.store.flush()

    # commit once for each feed
    db.store.commit()