def destroy():
    """Remove the id index database and its directory from the cache."""
    from planet import logger as log
    cache = config.cache_directory()
    index_dir = os.path.join(cache, 'index')
    if not os.path.exists(index_dir):
        return None
    idindex = filename(index_dir, 'id')
    if os.path.exists(idindex):
        os.unlink(idindex)
    os.removedirs(index_dir)
    log.info(idindex + " deleted")
def destroy():
    """Delete the id index from the cache, if one exists."""
    from planet import logger as log
    index_dir = os.path.join(config.cache_directory(), 'index')
    if not os.path.exists(index_dir):
        return None
    # unlink the dbm file first, then remove the now-empty directory
    idindex = filename(index_dir, 'id')
    if os.path.exists(idindex):
        os.unlink(idindex)
    os.removedirs(index_dir)
    log.info(idindex + " deleted")
def create():
    """Build the id index, mapping each entry id to its source feed id.

    Scans every cached entry file, extracting the <id> pairs with libxml2
    when available and falling back to minidom otherwise.  Returns the
    freshly opened index (via the module-level open()).
    """
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index):
        os.makedirs(index)
    import anydbm
    index = anydbm.open(filename(index, 'id'), 'c')

    try:
        import libxml2
    except:
        libxml2 = False
        from xml.dom import minidom

    for file in glob(cache + "/*"):
        if os.path.isdir(file):
            continue
        elif libxml2:
            try:
                doc = libxml2.parseFile(file)
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                if entry and source:
                    index[filename('', entry[0].content)] = source[0].content
                doc.freeDoc()
            except:
                # best-effort: a malformed cache file shouldn't abort indexing
                log.error(file)
        else:
            try:
                doc = minidom.parse(file)
                doc.normalize()
                ids = doc.getElementsByTagName('id')
                entry = [e for e in ids if e.parentNode.nodeName == 'entry']
                source = [e for e in ids if e.parentNode.nodeName == 'source']
                if entry and source:
                    index[filename('', entry[0].childNodes[0].nodeValue)] = \
                        source[0].childNodes[0].nodeValue
                # bug fix: minidom documents are released with unlink();
                # freeDoc() is the libxml2 API and raised AttributeError here,
                # which the bare except then mislogged as a parse error.
                doc.unlink()
            except:
                log.error(file)

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()
    return open()
def create():
    """Build the id index (entry id -> source feed id) with dbhash.

    Walks the cache directory, parsing each entry file with libxml2 when
    available, otherwise with minidom.  Returns the reopened index via
    the module-level open().
    """
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index):
        os.makedirs(index)
    import dbhash
    index = dbhash.open(filename(index, 'id'), 'c')

    try:
        import libxml2
    except:
        libxml2 = False
        from xml.dom import minidom

    for file in glob(cache + "/*"):
        if os.path.isdir(file):
            continue
        elif libxml2:
            try:
                doc = libxml2.parseFile(file)
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                if entry and source:
                    index[filename('', entry[0].content)] = source[0].content
                doc.freeDoc()
            except:
                # best-effort: skip and log unparseable cache files
                log.error(file)
        else:
            try:
                doc = minidom.parse(file)
                doc.normalize()
                ids = doc.getElementsByTagName('id')
                entry = [e for e in ids if e.parentNode.nodeName == 'entry']
                source = [e for e in ids if e.parentNode.nodeName == 'source']
                if entry and source:
                    index[filename('', entry[0].childNodes[0].nodeValue)] = \
                        source[0].childNodes[0].nodeValue
                # bug fix: minidom uses unlink(), not libxml2's freeDoc();
                # the old call raised AttributeError and mislogged the file.
                doc.unlink()
            except:
                log.error(file)

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()
    return open()
def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    """Fetch a reading list, honoring any cached etag/last-modified.

    On success the response is handed to callback and (optionally) cached;
    on any failure the cached copy is used instead, when permitted.
    """
    from planet import logger
    import config
    try:
        import urllib2, StringIO
        from planet.spider import filename

        # where this list's cached copy and its headers live
        cache_filename = filename(config.cache_lists_directory(), list)

        # start from the options in the original config ...
        options = {}
        for key in orig_config.options(list):
            options[key] = orig_config.get(list, key)

        # ... then layer on anything remembered in the cache (etag, etc.)
        try:
            if use_cache:
                cached_config = ConfigParser()
                cached_config.read(cache_filename)
                for option in cached_config.options(list):
                    options[option] = cached_config.get(list,option)
        except:
            pass

        # rebuild the cached section from the merged options
        cached_config = ConfigParser()
        cached_config.add_section(list)
        for key, value in options.items():
            cached_config.set(list, key, value)

        # resolve the list relative to the current directory as a file: URL
        curdir=getattr(os.path, 'curdir', '.')
        if sys.platform.find('win') < 0:
            base = urljoin('file:', os.path.abspath(curdir))
        else:
            # Windows paths need drive-colon and backslash munging
            path = os.path.abspath(os.path.curdir)
            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))

        # conditional GET using whatever validators we have
        request = urllib2.Request(urljoin(base + '/', list))
        if options.has_key("etag"):
            request.add_header('If-None-Match', options['etag'])
        if options.has_key("last-modified"):
            request.add_header('If-Modified-Since', options['last-modified'])
        response = urllib2.urlopen(request)

        # remember the new validators for next time
        if response.headers.has_key('etag'):
            cached_config.set(list, 'etag', response.headers['etag'])
        if response.headers.has_key('last-modified'):
            cached_config.set(list, 'last-modified',
                response.headers['last-modified'])

        # let the caller convert the payload into config.ini form
        data = StringIO.StringIO(response.read())
        if callback:
            callback(data, cached_config)

        # persist the refreshed copy
        if use_cache:
            cache = open(cache_filename, 'w')
            cached_config.write(cache)
            cache.close()

        # fold the result back into the caller's config
        logger.debug("Using %s readinglist", list)
        if re_read:
            if use_cache:
                orig_config.read(cache_filename)
            else:
                cdata = StringIO.StringIO()
                cached_config.write(cdata)
                cdata.seek(0)
                orig_config.readfp(cdata)
    except:
        # download failed -- fall back on the cached copy when allowed
        try:
            if re_read:
                if use_cache:
                    if not orig_config.read(cache_filename):
                        raise Exception()
                else:
                    cdata = StringIO.StringIO()
                    cached_config.write(cdata)
                    cdata.seek(0)
                    orig_config.readfp(cdata)
            logger.info("Using cached %s readinglist", list)
        except:
            logger.exception("Unable to read %s readinglist", list)
import planet planet.getLogger('DEBUG',config.log_format()) if not offline: from planet import spider try: spider.spiderPlanet(only_if_new=only_if_new) except Exception, e: print e from planet import splice doc = splice.splice() if debug_splice: from planet import logger logger.info('writing debug.atom') debug=open('debug.atom','w') try: from lxml import etree from StringIO import StringIO tree = etree.tostring(etree.parse(StringIO(doc.toxml()))) debug.write(etree.tostring(tree, pretty_print=True)) except: debug.write(doc.toprettyxml(indent=' ', encoding='utf-8')) debug.close splice.apply(doc.toxml('utf-8')) if config.pubsubhubbub_hub() and not no_publish: from planet import publish publish.publish(config)
# NOTE(review): this line is an orphaned duplicate of the tail of create()
# (minidom branch, logging, close, return) fused with the __main__ guard.
# Its opening statements continue a `try:` whose head is not on this line,
# so it cannot be safely reformatted in isolation -- confirm against the
# canonical copy of create() and deduplicate.
source[0].childNodes[0].nodeValue doc.freeDoc() except: log.error(file) log.info(str(len(index.keys())) + " entries indexed") index.close() return open() if __name__ == '__main__': if len(sys.argv) < 2: print 'Usage: %s [-c|-d]' % sys.argv[0] sys.exit(1) config.load(sys.argv[1]) if len(sys.argv) > 2 and sys.argv[2] == '-c': create() elif len(sys.argv) > 2 and sys.argv[2] == '-d': destroy() else: from planet import logger as log index = open() if index: log.info(str(len(index.keys())) + " entries indexed") index.close() else: log.info("no entries indexed")
import planet planet.getLogger('DEBUG', config.log_format()) if not offline: from planet import spider try: spider.spiderPlanet(only_if_new=only_if_new) except Exception, e: print e from planet import splice doc = splice.splice() if debug_splice: from planet import logger logger.info('writing debug.atom') debug = open('debug.atom', 'w') try: from lxml import etree from StringIO import StringIO tree = etree.tostring(etree.parse(StringIO(doc.toxml()))) debug.write(etree.tostring(tree, pretty_print=True)) except: debug.write(doc.toprettyxml(indent=' ', encoding='utf-8')) debug.close splice.apply(doc.toxml('utf-8')) if config.pubsubhubbub_hub() and not no_publish: from planet import publish publish.publish(config)
# NOTE(review): orphaned duplicate fragment -- the minidom tail of create()
# (index assignment, freeDoc, logging, close, return) fused with the
# __main__ guard.  The leading assignment continues a statement whose start
# is not on this line, so the code is left byte-identical; reconcile with
# the canonical create() and remove this duplicate.
index[filename('',entry[0].childNodes[0].nodeValue)] = \ source[0].childNodes[0].nodeValue doc.freeDoc() except: log.error(file) log.info(str(len(index.keys())) + " entries indexed") index.close() return open() if __name__ == '__main__': if len(sys.argv) < 2: print 'Usage: %s [-c|-d]' % sys.argv[0] sys.exit(1) config.load(sys.argv[1]) if len(sys.argv) > 2 and sys.argv[2] == '-c': create() elif len(sys.argv) > 2 and sys.argv[2] == '-d': destroy() else: from planet import logger as log index = open() if index: log.info(str(len(index.keys())) + " entries indexed") index.close() else: log.info("no entries indexed")