def expungeCache():
    """ Expunge old entries from a cache of entries """
    log = planet.logger

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'):
            continue
        if config.feed_options(sub).has_key('cache_keep_entries'):
            entry_count[data.feed.id] = int(
                config.feed_options(sub)['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    for mtime, file in dir:
        try:
            entry = minidom.parse(file)
            # determine source of entry
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if not sources:
                # no source determined, do not delete
                log.debug("No source found for %s", file)
                continue
            ids = sources[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", file)
                continue
            if ids[0].childNodes[0].nodeValue in entry_count:
                # subscribed to feed, update entry count
                entry_count[ids[0].childNodes[0].nodeValue] = entry_count[
                    ids[0].childNodes[0].nodeValue] - 1
                if entry_count[ids[0].childNodes[0].nodeValue] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s", file,
                              ids[0].childNodes[0].nodeValue)
                    continue
                else:
                    # maximum reached
                    log.debug("Removing %s, maximum reached for %s", file,
                              ids[0].childNodes[0].nodeValue)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s", file,
                          ids[0].childNodes[0].nodeValue)
            # remove old entry
            os.unlink(file)
        except:
            log.error("Error parsing %s", file)
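# A minimal driver sketch (assumption: not part of the original module)
# showing how expungeCache() is typically run against a planet
# configuration: load the config, install the module-level logger, then
# expunge.  config.load() and planet.getLogger() are used as elsewhere in
# this code base; the function name and config file name are illustrative.
def _expunge_example(config_file='planet.ini'):
    import planet
    from planet import config
    config.load(config_file)
    planet.logger = planet.getLogger(config.log_level(), config.log_format())
    expungeCache()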
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower() == 'timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time() - 86400 * config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers['last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'):
            data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'):
            feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(feedparser.FeedParserDict(
                {'rel': 'self', 'type': feedtype, 'href': feed_uri}))
    for name, value in config.feed_options(feed_uri).items():
        if name == "gravatar" and re.match(re_email, value):
            data.feed['planet_' + name] = md5(value.strip()).hexdigest()
        else:
            data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None:
        index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']:
            continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'):
            updated = entry.published
        if entry.has_key('updated'):
            updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('',))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)

        # check if blacklist file exists. If so, skip it.
        if os.path.exists(blacklist_file):
            continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed', None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime:
            mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output:
                break
        if not output:
            if os.path.exists(cache_file):
                os.remove(cache_file)
            continue

        # write out and timestamp the results
        write(output, cache_file, mtime)

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode:
                    feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index:
        index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [entry.updated_parsed for entry in data.entries
                   if entry.has_key('updated_parsed')]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
            updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'):
            del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources):
        os.makedirs(sources)
    xdoc = minidom.parseString('''<feed xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo, data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()
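# Sketch of the filter hook writeCache uses above: each configured filter is
# executed via shell.run() with the reconstituted entry XML as its input, and
# whatever the filter emits replaces the entry; an empty result causes the
# cached entry to be removed, as the loop shows.  The standalone script below
# is an illustrative filter (assumption: not one shipped with this code base),
# kept as a string because a real filter would live in its own file.
_EXAMPLE_FILTER = r'''#!/usr/bin/env python
import sys
entry = sys.stdin.read()              # one Atom entry, UTF-8 encoded
if 'rel="via"' not in entry:          # hypothetical pass/drop condition
    sys.stdout.write(entry)           # entries that pass go through unchanged
'''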
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet """
    log = planet.logger

    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue
    for thread in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:

                if not hasattr(feed, 'headers') or int(feed.headers.status) < 300:
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)
                        try:
                            modified = time.strptime(
                                feed_info.feed.get('planet_http_last_modified',
                                                   None))
                        except:
                            pass

                    data = feedparser.parse(feed, **options)
                else:
                    data = feedparser.FeedParserDict({'version': None,
                        'headers': feed.headers, 'entries': [], 'feed': {},
                        'href': feed.url, 'bozo': 0,
                        'status': int(feed.headers.status)})

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id:
                    id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'):
                    href = data.href

                duplicate = None
                if id and id in feeds_seen:
                    duplicate = id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                             (uri, feeds_seen[duplicate]))
                    if href:
                        feed_info.feed['planet_http_location'] = href

                if id:
                    feeds_seen[id] = uri
                if href:
                    feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception, e:
                import sys, traceback
                type, value, tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(type, value) +
                             traceback.format_tb(tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        for index in threads.keys():
            if not threads[index].isAlive():
                del threads[index]
                if not threads:
                    log.info("Finished threaded part of processing.")
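# Sketch of the worker contract spiderPlanet assumes.  The real fetcher is
# httpThread, defined elsewhere in this module; the function below is an
# illustrative stand-in, not its implementation (assumption: a plain urllib2
# fetch with no caching and no special error handling).  Workers drain
# fetch_queue of (uri, feed_info) pairs until the (None, None) sentinel and
# push (uri, feed_info, feed) onto parse_queue, where `feed` exposes the
# `headers.status` and `url` attributes the parse loop above inspects.
def _fetch_worker_sketch(thread_index, fetch_queue, parse_queue, log):
    import urllib2
    from StringIO import StringIO
    while True:
        uri, feed_info = fetch_queue.get(block=True)
        if uri is None:                      # end-of-queue sentinel
            break
        try:
            resp = urllib2.urlopen(uri)
            feed = StringIO(resp.read())
            feed.url = resp.geturl()
            feed.headers = feedparser.FeedParserDict({'status': str(resp.code)})
        except Exception:
            log.exception("Unable to fetch %s", uri)
            continue
        parse_queue.put(item=(uri, feed_info, feed))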
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower() == 'timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time() - 86400 * config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.version and feed_info.version:
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers['last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.version:
        if not data.feed.has_key('links'):
            data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'):
            feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(feedparser.FeedParserDict(
                {'rel': 'self', 'type': feedtype, 'href': feed_uri}))
    for name, value in config.feed_options(feed_uri).items():
        data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None:
        index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        if not entry['id']:
            continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'):
            updated = entry.published
        if entry.has_key('updated'):
            updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('',))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        # FEEDWORLD: published preferred over updated
        if not entry.has_key('published_parsed') or not entry['published_parsed']:
            entry['published_parsed'] = entry.get('updated_parsed', None)
        if entry.has_key('published_parsed'):
            try:
                mtime = calendar.timegm(entry.published_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('published_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.published_parsed)
                    except:
                        pass
        if not mtime:
            mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output:
                break
        if not output:
            if os.path.exists(cache_file):
                os.remove(cache_file)
            continue

        # FEEDWORLD: ugly hack to get around feeds that don't have timestamps
        try:
            elements = minidom.parseString(output).getElementsByTagName(
                "feedworld_mtime")
            if len(elements) > 0:
                mtime = int(elements[0].firstChild.data)
                # log.info("mtime=%d\n" % mtime)
        except Exception, e:
            log.debug("error: %s\n" % e)

        # write out and timestamp the results
        write(output, cache_file)
        os.utime(cache_file, (mtime, mtime))

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode:
                    feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower() == 'timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time() - 86400 * config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers['last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'):
            data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'):
            feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(feedparser.FeedParserDict(
                {'rel': 'self', 'type': feedtype, 'href': feed_uri}))
    for name, value in config.feed_options(feed_uri).items():
        data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None:
        index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']:
            continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'):
            updated = entry.published
        if entry.has_key('updated'):
            updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('',))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)

        # check if blacklist file exists. If so, skip it.
        if os.path.exists(blacklist_file):
            continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed', None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime:
            mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output:
                break
        if not output:
            if os.path.exists(cache_file):
                os.remove(cache_file)
            continue

        # write out and timestamp the results
        write(output, cache_file, mtime)

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode:
                    feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index:
        index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [entry.updated_parsed for entry in data.entries
                   if entry.has_key('updated_parsed')]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
            updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'):
            del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources):
        os.makedirs(sources)
    xdoc = minidom.parseString('''<feed xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo, data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()

    posted_urls = set()
    if config.post_to_twitter():
        if os.path.exists(posted_urls_file):
            try:
                with open(posted_urls_file, 'rb') as f:
                    posted_urls = pickle.load(f)
            except Exception as ex:
                log.error("Error reading posted_urls %s", ex)
    # print(posted_urls)

    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # Twitter integration
            if config.post_to_twitter():
                url = None
                twitter = None
                title = "Untitled post..."

                links = entry.getElementsByTagName('link')
                if links:
                    for link in links:
                        if link.hasAttribute('rel') and link.hasAttribute('type') \
                                and link.hasAttribute('href'):
                            if (link.getAttribute('rel') == 'alternate' and
                                    link.getAttribute('type') == 'text/html'):
                                url = link.getAttribute('href')
                                break

                titles = entry.getElementsByTagName('title')
                if titles:
                    title = unicode(
                        titles[0].firstChild.nodeValue.encode('utf-8'),
                        'utf-8').strip()

                handles = entry.getElementsByTagName('planet:twitter')
                if (handles):
                    twitter = unicode(
                        handles[0].firstChild.nodeValue.encode('utf-8'), "utf-8")

                if url is not None and url not in posted_urls:
                    # log.debug("Going to post URL to Twitter: twitter='{}' title='{}', url='{}'".format(twitter, title, url))
                    txt_append = u''
                    if twitter:
                        txt_append = u" (by @" + twitter.encode('utf-8').strip() + u")"
                    max_title_len = 280 - 20 - len(txt_append)
                    if (len(title) > max_title_len):
                        title = title[:max_title_len]
                    txt = title + txt_append + u"\n" + url

                    log.debug(u"Text to post '{}'".format(txt))
                    try:
                        posted_urls.add(url)
                        config.twitter_api.update_status(txt)
                    except Exception as ex:
                        log.error(u"Error posting to Twitter: %s", ex)

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except Exception as ex:
            log.error("Error parsing %s: %s", file, ex)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)

    if config.post_to_twitter():
        with open(posted_urls_file, 'wb') as f:
            pickle.dump(posted_urls, f, protocol=pickle.HIGHEST_PROTOCOL)

    if index:
        index.close()

    return doc
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.getLogger(config.log_level(), config.log_format())

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
                    ids = sources[0].getElementsByTagName('planet:id')
                    if not ids:
                        continue
                    if ids[0].childNodes[0].nodeValue not in sub_ids:
                        continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
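# A minimal pipeline sketch (assumption: not part of the original modules)
# tying the functions in this section together: fetch the subscriptions,
# splice the cached entries into one Atom document, and serialize it.  It
# assumes the usual planet.spider / planet.splice module layout; the
# template machinery (an apply() step) normally consumes the spliced
# document, so writing it straight to 'planet.atom' here is illustrative.
def _planet_run_sketch(config_file='planet.ini'):
    import planet
    from planet import config, spider, splice
    config.load(config_file)
    planet.logger = planet.getLogger(config.log_level(), config.log_format())
    spider.spiderPlanet()            # refresh the entry cache
    doc = splice.splice()            # build the combined Atom document
    output = open('planet.atom', 'w')
    output.write(doc.toxml('utf-8'))
    output.close()
    doc.unlink()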