Code example #1
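expungeCache() walks the cached entries and deletes those that either belong to feeds no longer subscribed to or exceed the per-feed cache_keep_entries limit.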
def expungeCache():
    """ Expunge old entries from a cache of entries """
    log = planet.logger

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
        if not data.feed.has_key('id'): continue
        if config.feed_options(sub).has_key('cache_keep_entries'):
            entry_count[data.feed.id] = int(config.feed_options(sub)['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    for mtime,file in dir:

        try:
            entry=minidom.parse(file)
            # determine source of entry
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if not sources:
                # no source determined, do not delete
                log.debug("No source found for %s", file)
                continue
            ids = sources[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", file)
                continue
            if ids[0].childNodes[0].nodeValue in entry_count:
                # subscribed to feed, update entry count
                entry_count[ids[0].childNodes[0].nodeValue] = entry_count[
                    ids[0].childNodes[0].nodeValue] - 1
                if entry_count[ids[0].childNodes[0].nodeValue] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s",
                        file, ids[0].childNodes[0].nodeValue)
                    continue
                else:
                    # maximum reached
                    log.debug("Removing %s, maximum reached for %s",
                        file, ids[0].childNodes[0].nodeValue)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s",
                    file, ids[0].childNodes[0].nodeValue)
            # remove old entry
            os.unlink(file)

        except:
            log.error("Error parsing %s", file)
Code example #2
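The Venus-style writeCache(): it records the HTTP status and caching headers for a fetched feed, applies scrub and the configured filters, writes each surviving entry to the cache (skipping blacklisted ids), optionally updates the id index, and flags inactive feeds and channel-level errors before persisting the per-feed source document.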
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(
            data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers[
                'last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(
                data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
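        # for/else: tag an existing self link with the feed type, or append one if none exists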
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(
                feedparser.FeedParserDict({
                    'rel': 'self',
                    'type': feedtype,
                    'href': feed_uri
                }))
    for name, value in config.feed_options(feed_uri).items():
        if name == "gravatar" and re.match(re_email, value):
            data.feed['planet_' + name] = md5(value.strip()).hexdigest()
        else:
            data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated = entry.published
        if entry.has_key('updated'): updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('', ))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)

        # check if blacklist file exists. If so, skip it.
        if os.path.exists(blacklist_file):
            continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed', None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
            if os.path.exists(cache_file): os.remove(cache_file)
            continue

        # write out and timestamp the results
        write(output, cache_file, mtime)

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index: index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [
            entry.updated_parsed for entry in data.entries
            if entry.has_key('updated_parsed')
        ]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
            updated = [
                feedparser._parse_date_iso8601(data.feed.planet_updated)
            ]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(
                feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'): del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources): os.makedirs(sources)
    xdoc = minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo,
                        data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()
Code example #3
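spiderPlanet() fetches an entire planet: it sets the socket timeout, optionally starts httpThread workers fed from a queue of subscriptions, then parses each result as it arrives, detects duplicate subscriptions, and hands every feed to writeCache().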
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet """
    log = planet.logger

    global index
    index = True
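    # True is only a sentinel meaning "indexing enabled"; writeCache() replaces it with the opened idindex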

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue
    for thread in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:

                if not hasattr(feed, 'headers') or int(feed.headers.status) < 300:
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag',None)
                        try:
                            modified = time.strptime(
                                feed_info.feed.get('planet_http_last_modified',
                                                   None))
                        except:
                            pass

                    data = feedparser.parse(feed, **options)
                else:
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'href': feed.url,
                        'bozo': 0,
                        'status': int(feed.headers.status)
                    })

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id: id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'): href = data.href

                duplicate = None
                if id and id in feeds_seen:
                    duplicate = id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                             (uri, feeds_seen[duplicate]))
                    if href: feed_info.feed['planet_http_location'] = href

                if id: feeds_seen[id] = uri
                if href: feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception, e:
                import sys, traceback
                type, value, tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(type, value) +
                             traceback.format_tb(tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        for index in threads.keys():
            if not threads[index].isAlive():
                del threads[index]
                if not threads:
                    log.info("Finished threaded part of processing.")
Code example #4
File: spider.py, project: hoonose/aggregator
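A writeCache() variant from the hoonose/aggregator fork: it drops the blacklist and gravatar handling, prefers published over updated timestamps, and honors an optional feedworld_mtime element in the filtered output when timestamping cache files.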
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(
            data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.version and feed_info.version:
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers[
                'last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(
                data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.version:
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(
                feedparser.FeedParserDict({
                    'rel': 'self',
                    'type': feedtype,
                    'href': feed_uri
                }))
    for name, value in config.feed_options(feed_uri).items():
        data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
            if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated = entry.published
        if entry.has_key('updated'): updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('', ))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        #FEEDWORLD: published preferred over updated
        if not entry.has_key(
                'published_parsed') or not entry['published_parsed']:
            entry['published_parsed'] = entry.get('updated_parsed', None)
        if entry.has_key('published_parsed'):
            try:
                mtime = calendar.timegm(entry.published_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('published_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.published_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
            if os.path.exists(cache_file): os.remove(cache_file)
            continue

        #FEEDWORLD: ugly hack to get around feeds that don't have timestamps
        try:
            elements = minidom.parseString(output).getElementsByTagName(
                "feedworld_mtime")
            if len(elements) > 0:
                mtime = int(elements[0].firstChild.data)
                #log.info("mtime=%d\n"%mtime)
        except Exception, e:
            log.debug("error: %s\n" % e)

        # write out and timestamp the results
        write(output, cache_file)
        os.utime(cache_file, (mtime, mtime))

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid
Code example #5
File: spider.py, project: ArchLinuxJP/archplanet-jp
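The same writeCache() as example #2 in the original layout of the ArchLinuxJP/archplanet-jp fork, minus the gravatar option handling.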
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries)>0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
               return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
               del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] =  data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified']=data.headers['last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090','rss10']: feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(feedparser.FeedParserDict(
                {'rel':'self', 'type':feedtype, 'href':feed_uri}))
    for name, value in config.feed_options(feed_uri).items():
        data.feed['planet_'+name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()
 
    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated=entry.published
        if entry.has_key('updated'):   updated=entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id,('',))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)  

        # check if blacklist file exists. If so, skip it. 
        if os.path.exists(blacklist_file):
           continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed',None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
          if os.path.exists(cache_file): os.remove(cache_file)
          continue

        # write out and timestamp the results
        write(output, cache_file, mtime) 
    
        # optionally index
        if index != None: 
            feedid = data.feed.get('id', data.feed.get('link',None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index: index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [entry.updated_parsed for entry in data.entries
            if entry.has_key('updated_parsed')]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
           updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'): del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources): os.makedirs(sources)
    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement,data.feed,data.bozo,data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()
Code example #6
File: spider.py, project: ArchLinuxJP/archplanet-jp
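The same spiderPlanet() as example #3 in its original layout.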
def spiderPlanet(only_if_new = False):
    """ Spider (fetch) an entire planet """
    log = planet.logger

    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)


    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                args=(i,fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status',None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue
    for thread in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:

                if not hasattr(feed,'headers') or int(feed.headers.status)<300:
                    options = {}
                    if hasattr(feed_info,'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag',None)
                        try:
                            modified=time.strptime(
                                feed_info.feed.get('planet_http_last_modified',
                                None))
                        except:
                            pass

                    data = feedparser.parse(feed, **options)
                else:
                    data = feedparser.FeedParserDict({'version': None,
                        'headers': feed.headers, 'entries': [], 'feed': {},
                        'href': feed.url, 'bozo': 0,
                        'status': int(feed.headers.status)})

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id: id = feed_info.feed.get('id', None)

                href=uri
                if data.has_key('href'): href=data.href

                duplicate = None
                if id and id in feeds_seen:
                   duplicate = id
                elif href and href in feeds_seen:
                   duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                        (uri, feeds_seen[duplicate]))
                    if href: feed_info.feed['planet_http_location'] = href

                if id: feeds_seen[id] = uri
                if href: feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception, e:
                import sys, traceback
                type, value, tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(type, value) +
                    traceback.format_tb(tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        for index in threads.keys():
            if not threads[index].isAlive():
                del threads[index]
                if not threads:
                    log.info("Finished threaded part of processing.")
Code example #7
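splice() assembles the combined Atom document from the cache: it adds planet-level metadata and one planet:source element per subscription, then appends up to max_items cached entries that still belong to subscribed feeds, with at most new_feed_items per feed.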
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([
        config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']
    ])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids: continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items: continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids: continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids: continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items: break
        except:
            log.error("Error parsing %s", file)

    if index: index.close()

    return doc
Code example #8
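A splice() variant that also advertises a PubSubHubbub hub link, warns about subscriptions missing self or HTML alternate links, and looks up source elements by the Atom namespace.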
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items=max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())    
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet',planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'): data.feed['links'] = []

            for link in data.feed.links:
              if link.rel == 'self': break
            else:
              log.debug('missing self link for ' + sub)

            for link in data.feed.links:
              if link.rel == 'alternate' and 'html' in link.type: break
            else:
              log.debug('missing html link for ' + sub)

        xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS='http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()
    for mtime,file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids: continue

        try:
            entry=minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id,0) + 1
                    if new_feed_items and count[id] > new_feed_items: continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids: continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                          log.warn('Skipping: ' + id)
                        if id not in sub_ids: continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items: break
        except:
            log.error("Error parsing %s", file)

    if index: index.close()

    return doc
Code example #9
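The same expungeCache() as example #1, reformatted.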
def expungeCache():
    """ Expunge old entries from a cache of entries """
    log = planet.logger

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'): continue
        if config.feed_options(sub).has_key('cache_keep_entries'):
            entry_count[data.feed.id] = int(
                config.feed_options(sub)['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    for mtime, file in dir:

        try:
            entry = minidom.parse(file)
            # determine source of entry
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if not sources:
                # no source determined, do not delete
                log.debug("No source found for %s", file)
                continue
            ids = sources[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", file)
                continue
            if ids[0].childNodes[0].nodeValue in entry_count:
                # subscribed to feed, update entry count
                entry_count[ids[0].childNodes[0].nodeValue] = entry_count[
                    ids[0].childNodes[0].nodeValue] - 1
                if entry_count[ids[0].childNodes[0].nodeValue] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s", file,
                              ids[0].childNodes[0].nodeValue)
                    continue
                else:
                    # maximum reached
                    log.debug("Removing %s, maximum reached for %s", file,
                              ids[0].childNodes[0].nodeValue)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s", file,
                          ids[0].childNodes[0].nodeValue)
            # remove old entry
            os.unlink(file)

        except:
            log.error("Error parsing %s", file)
Code example #10
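A splice() variant that, in addition to the behaviour of example #8, posts newly seen entry URLs to Twitter via config.twitter_api and remembers already-posted URLs in a pickle file.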
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([
        config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']
    ])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()

    posted_urls = set()
    if config.post_to_twitter():
        if os.path.exists(posted_urls_file):
            try:
                with open(posted_urls_file, 'rb') as f:
                    posted_urls = pickle.load(f)
            except Exception as ex:
                log.error("Error reading posted_urls %s", ex)
#    print(posted_urls)

    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # Twitter integration
            if config.post_to_twitter():
                url = None
                twitter = None
                title = "Untitled post..."
                links = entry.getElementsByTagName('link')
                if links:
                    for link in links:
                        if link.hasAttribute('rel') and link.hasAttribute(
                                'type') and link.hasAttribute('href'):
                            if (link.getAttribute('rel') == 'alternate' and
                                    link.getAttribute('type') == 'text/html'):
                                url = link.getAttribute('href')
                                break

                titles = entry.getElementsByTagName('title')
                if titles:
                    title = unicode(
                        titles[0].firstChild.nodeValue.encode('utf-8'),
                        'utf-8').strip()
                handles = entry.getElementsByTagName('planet:twitter')
                if (handles):
                    twitter = unicode(
                        handles[0].firstChild.nodeValue.encode('utf-8'),
                        "utf-8")

                if url is not None and url not in posted_urls:
                    # log.debug("Going to post URL to Twitter: twitter='{}' title='{}', url='{}'".format(twitter, title, url))
                    txt_append = u''
                    if twitter:
                        txt_append = u" (by @" + twitter.encode(
                            'utf-8').strip() + u")"
                    max_title_len = 280 - 20 - len(txt_append)
                    if (len(title) > max_title_len):
                        title = title[:max_title_len]
                    txt = title + txt_append + u"\n" + url

                    log.debug(u"Text to post '{}'".format(txt))
                    try:
                        posted_urls.add(url)
                        config.twitter_api.update_status(txt)
                    except Exception as ex:
                        log.error(u"Error posting to Twitter: %s", ex)

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except Exception as ex:
            log.error("Error parsing %s: %s", file, ex)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=2,
                                      file=sys.stdout)

    if config.post_to_twitter():
        with open(posted_urls_file, 'wb') as f:
            pickle.dump(posted_urls, f, protocol=pickle.HIGHEST_PROTOCOL)

    if index:
        index.close()

    return doc
Code example #11
File: splice.py, project: 3rdandUrban-dev/Nuxleus
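An older splice() from the Nuxleus tree: it obtains its logger via planet.getLogger(), emits no hub link, and applies no per-feed entry limit.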
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.getLogger(config.log_level(),config.log_format())

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items=max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())    
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet',planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue
        xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    for mtime,file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids: continue

        try:
            entry=minidom.parse(file)

            # verify that this entry is currently subscribed to
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
                    ids = sources[0].getElementsByTagName('planet:id')
                    if not ids: continue
                    if ids[0].childNodes[0].nodeValue not in sub_ids: continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items: break
        except:
            log.error("Error parsing %s", file)

    if index: index.close()

    return doc
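
How these pieces fit together, as a minimal sketch rather than code from any of the projects above: the call sequence below assumes the upstream Planet Venus driver conventions, in particular config.load() and splice.apply(), which may differ in the forks shown here.

# hedged sketch of a typical driver, assuming Venus-style planet.config/spider/splice/expunge modules
from planet import config, spider, splice, expunge

config.load('config.ini')                # read planet options and the subscription list
spider.spiderPlanet(only_if_new=False)   # fetch every feed and fill the entry cache (examples #3, #6)
doc = splice.splice()                    # build the combined Atom document (examples #7, #8, #10, #11)
splice.apply(doc.toxml('utf-8'))         # assumed Venus API: render templates from the spliced document
expunge.expungeCache()                   # trim old cached entries (examples #1, #9)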