Beispiel #1
0
def main():
    """Command-line entry point: parse options, load the config, run Planet.

    Reads options from ``sys.argv``; the last non-option argument replaces
    the default ``CONFIG_FILE``.  The process exits with status 0 after
    printing the help text, and with status 1 on an unknown option, a
    configuration without a ``[Planet]`` section, or when none of the
    configured locales can be set.
    """
    config_file = CONFIG_FILE
    offline = 0
    verbose = 0

    for arg in sys.argv[1:]:
        if arg in ("-h", "--help"):
            # Written with sys.stdout.write (not the print statement) so the
            # block parses on both Python 2 and Python 3; output is unchanged.
            sys.stdout.write(
                "Usage: planet [options] [CONFIGFILE]\n"
                "\n"
                "Options:\n"
                " -v, --verbose       DEBUG level logging during update\n"
                " -o, --offline       Update the Planet from the cache only\n"
                " -h, --help          Display this help message and exit\n"
                "\n")
            sys.exit(0)
        elif arg in ("-v", "--verbose"):
            verbose = 1
        elif arg in ("-o", "--offline"):
            offline = 1
        elif arg.startswith("-"):
            sys.stderr.write("Unknown option: %s\n" % (arg,))
            sys.exit(1)
        else:
            config_file = arg

    # Read the configuration file
    config = ConfigParser()
    config.read(config_file)
    if not config.has_section("Planet"):
        sys.stderr.write("Configuration missing [Planet] section.\n")
        sys.exit(1)

    # Read the [Planet] config section
    planet_name = config_get(config, "Planet", "name",        PLANET_NAME)
    planet_link = config_get(config, "Planet", "link",        PLANET_LINK)
    planet_feed = config_get(config, "Planet", "feed",        PLANET_FEED)
    owner_name  = config_get(config, "Planet", "owner_name",  OWNER_NAME)
    owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL)
    if verbose:
        # --verbose overrides whatever log level the config file asks for
        log_level = "DEBUG"
    else:
        log_level  = config_get(config, "Planet", "log_level", LOG_LEVEL)
    feed_timeout   = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT)
    # split() with no separator collapses runs of whitespace (including the
    # newlines of a continued config value), so no empty template names are
    # produced the way split(" ") did.
    template_files = config_get(config, "Planet", "template_files",
                                TEMPLATE_FILES).split()

    # Default feed to the first feed for which there is a template
    if not planet_feed:
        for template_file in template_files:
            name = os.path.splitext(os.path.basename(template_file))[0]
            if 'atom' in name or 'rss' in name:
                planet_feed = urlparse.urljoin(planet_link, name)
                break

    # Define locale
    if config.has_option("Planet", "locale"):
        # The user can specify more than one locale (separated by ":") as
        # fallbacks.
        locale_ok = False
        for user_locale in config.get("Planet", "locale").split(':'):
            user_locale = user_locale.strip()
            try:
                locale.setlocale(locale.LC_ALL, user_locale)
            except locale.Error:
                # Try the next fallback locale
                pass
            else:
                locale_ok = True
                break
        if not locale_ok:
            sys.stderr.write("Unsupported locale setting.\n")
            sys.exit(1)

    # Activate logging
    planet.logging.basicConfig()
    planet.logging.getLogger().setLevel(planet.logging.getLevelName(log_level))
    log = planet.logging.getLogger("planet.runner")
    # Logger objects that only provide warn() get a warning() alias
    if not hasattr(log, "warning"):
        log.warning = log.warn

    # timeoutsocket allows feedparser to time out rather than hang forever on
    # ultra-slow servers.  Python 2.3 now has this functionality available in
    # the standard socket library, so under 2.3 you don't need to install
    # anything.  But you probably should anyway, because the socket module is
    # buggy and timeoutsocket is better.
    if feed_timeout:
        try:
            feed_timeout = float(feed_timeout)
        except (TypeError, ValueError):
            log.warning("Feed timeout set to invalid value '%s', skipping", feed_timeout)
            feed_timeout = None

    if feed_timeout and not offline:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(feed_timeout)
            log.debug("Socket timeout set to %d seconds", feed_timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                log.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(feed_timeout)
                log.debug("Socket timeout set to %d seconds", feed_timeout)
            else:
                log.error("Unable to set timeout to %d seconds", feed_timeout)

    # run the planet
    my_planet = planet.Planet(config)
    my_planet.run(planet_name, planet_link, template_files, offline)

    my_planet.generate_all_files(template_files, planet_name,
        planet_link, planet_feed, owner_name, owner_email)
Beispiel #2
0
def spiderPlanet(only_if_new = False):
    """ Spider (fetch) an entire planet

    Sets the socket timeout from the configuration, optionally starts
    ``config.spider_threads()`` worker threads running ``httpThread``, queues
    every subscription and hands each result to ``writeCache``.

    only_if_new -- when true, subscriptions whose cached source already has
                   feed data are skipped.
    """
    log = planet.getLogger(config.log_level(),config.log_format())

    global index
    index = True

    timeout = config.feed_timeout()
    try:
        # Convert once and log the converted number: the raw config value may
        # be a string, which the "%d" log format cannot render.
        seconds = float(timeout)
        socket.setdefaulttimeout(seconds)
        log.info("Socket timeout set to %d seconds", seconds)
    except Exception:
        try:
            from planet import timeoutsocket
            seconds = float(timeout)
            timeoutsocket.setDefaultSocketTimeout(seconds)
            log.info("Socket timeout set to %d seconds", seconds)
        except Exception:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    thread_count = int(config.spider_threads())
    if thread_count:
        # Start all the worker threads
        for i in range(thread_count):
            threads[i] = Thread(target=httpThread,
                args=(i,fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status',None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            # No workers (or not an http uri): the uri itself is handed to
            # the parser as the thing to parse.
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue: one (None, None) entry per worker
    for _ in threads:
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:

                if not hasattr(feed,'headers') or int(feed.headers.status)<300:
                    options = {}
                    if hasattr(feed_info,'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag',None)
                        # NOTE(review): the parsed value is never used;
                        # presumably it was meant to be passed on as
                        # options['modified'] -- confirm before wiring it up.
                        try:
                            modified=time.strptime(
                                feed_info.feed.get('planet_http_last_modified',
                                None))
                        except (TypeError, ValueError):
                            pass

                    data = feedparser.parse(feed, **options)
                else:
                    # Status of 300 or above: build an empty result that only
                    # carries the response headers and status.
                    data = feedparser.FeedParserDict({'version': None,
                        'headers': feed.headers, 'entries': [], 'feed': {},
                        'bozo': 0, 'status': int(feed.headers.status)})

                writeCache(uri, feed_info, data)

            except Exception:
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(exc_type,
                    exc_value) + traceback.format_tb(exc_tb)):
                    log.error(line.rstrip())

        # Reap finished workers.  The loop variable is deliberately not
        # called "index": that name is declared global above and must not be
        # overwritten with a worker id.  Iterate over a copy of the keys
        # because entries are deleted along the way.
        for worker_id in list(threads.keys()):
            if not threads[worker_id].isAlive():
                del threads[worker_id]
                if not threads:
                    log.info("Finished threaded part of processing.")

        # Workers are still running but have produced nothing yet: wait a
        # little.  Reaping happens before this wait, so workers that exit
        # while the parse queue is empty cannot leave the loop waiting
        # forever.
        if threads and not parse_queue.qsize():
            time.sleep(0.1)
Beispiel #3
0
def main():
    """Command-line entry point: parse options, load the config, run Planet.

    Reads options from ``sys.argv``; the last non-option argument replaces
    the default ``CONFIG_FILE``.  The process exits with status 0 after
    printing the help text, and with status 1 on an unknown option, a
    configuration without a ``[Planet]`` section, or when none of the
    configured locales can be set.
    """
    config_file = CONFIG_FILE
    offline = 0
    verbose = 0

    for arg in sys.argv[1:]:
        if arg in ("-h", "--help"):
            # Written with sys.stdout.write (not the print statement) so the
            # block parses on both Python 2 and Python 3; output is unchanged.
            sys.stdout.write(
                "Usage: planet [options] [CONFIGFILE]\n"
                "\n"
                "Options:\n"
                " -v, --verbose       DEBUG level logging during update\n"
                " -o, --offline       Update the Planet from the cache only\n"
                " -h, --help          Display this help message and exit\n"
                "\n")
            sys.exit(0)
        elif arg in ("-v", "--verbose"):
            verbose = 1
        elif arg in ("-o", "--offline"):
            offline = 1
        elif arg.startswith("-"):
            sys.stderr.write("Unknown option: %s\n" % (arg,))
            sys.exit(1)
        else:
            config_file = arg

    # Read the configuration file
    config = ConfigParser()
    config.read(config_file)
    if not config.has_section("Planet"):
        sys.stderr.write("Configuration missing [Planet] section.\n")
        sys.exit(1)

    # Read the [Planet] config section
    planet_name = config_get(config, "Planet", "name", PLANET_NAME)
    planet_link = config_get(config, "Planet", "link", PLANET_LINK)
    planet_feed = config_get(config, "Planet", "feed", PLANET_FEED)
    owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME)
    owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL)
    if verbose:
        # --verbose overrides whatever log level the config file asks for
        log_level = "DEBUG"
    else:
        log_level = config_get(config, "Planet", "log_level", LOG_LEVEL)
    feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT)
    # split() with no separator collapses runs of whitespace (including the
    # newlines of a continued config value), so no empty template names are
    # produced the way split(" ") did.
    template_files = config_get(config, "Planet", "template_files",
                                TEMPLATE_FILES).split()

    # Default feed to the first feed for which there is a template
    if not planet_feed:
        for template_file in template_files:
            name = os.path.splitext(os.path.basename(template_file))[0]
            if 'atom' in name or 'rss' in name:
                planet_feed = urlparse.urljoin(planet_link, name)
                break

    # Define locale
    if config.has_option("Planet", "locale"):
        # The user can specify more than one locale (separated by ":") as
        # fallbacks.
        locale_ok = False
        for user_locale in config.get("Planet", "locale").split(':'):
            user_locale = user_locale.strip()
            try:
                locale.setlocale(locale.LC_ALL, user_locale)
            except locale.Error:
                # Try the next fallback locale
                pass
            else:
                locale_ok = True
                break
        if not locale_ok:
            sys.stderr.write("Unsupported locale setting.\n")
            sys.exit(1)

    # Activate logging
    planet.logging.basicConfig()
    planet.logging.getLogger().setLevel(planet.logging.getLevelName(log_level))
    log = planet.logging.getLogger("planet.runner")
    # Logger objects that only provide warn() get a warning() alias
    if not hasattr(log, "warning"):
        log.warning = log.warn

    # timeoutsocket allows feedparser to time out rather than hang forever on
    # ultra-slow servers.  Python 2.3 now has this functionality available in
    # the standard socket library, so under 2.3 you don't need to install
    # anything.  But you probably should anyway, because the socket module is
    # buggy and timeoutsocket is better.
    if feed_timeout:
        try:
            feed_timeout = float(feed_timeout)
        except (TypeError, ValueError):
            log.warning("Feed timeout set to invalid value '%s', skipping",
                        feed_timeout)
            feed_timeout = None

    if feed_timeout and not offline:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(feed_timeout)
            log.debug("Socket timeout set to %d seconds", feed_timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                log.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(feed_timeout)
                log.debug("Socket timeout set to %d seconds", feed_timeout)
            else:
                log.error("Unable to set timeout to %d seconds", feed_timeout)

    # run the planet
    my_planet = planet.Planet(config)
    my_planet.run(planet_name, planet_link, template_files, offline)

    my_planet.generate_all_files(template_files, planet_name, planet_link,
                                 planet_feed, owner_name, owner_email)
Beispiel #4
0
def run_planet(config_file=None,
               PLANET_NAME="Unconfigured Planet",
               PLANET_LINK="Unconfigured Planet",
               PLANET_FEED=None,
               OWNER_NAME="Anonymous Coward",
               OWNER_EMAIL="",
               verbose=0,
               offline=0,
               LOG_LEVEL="WARNING",
               FEED_TIMEOUT=20,
               TEMPLATE_FILES="examples/basic/planet.html.tmpl"):
    """Load a Planet configuration file, run the planet and generate output.

    config_file -- path handed to ``ConfigParser.read``; ``None`` reads
                   nothing, which then fails the ``[Planet]`` section check.
    verbose     -- when true, the log level is forced to "DEBUG".
    offline     -- passed through to ``Planet.run``; when true no socket
                   timeout is installed.

    The upper-case parameters are passed to ``config_get`` as the fallback
    value for the matching ``[Planet]`` option (presumably used when the
    option is absent -- see ``config_get``).

    Raises AssertionError when the configuration has no ``[Planet]``
    section.  Exits the process with status 1 when a ``locale`` option is
    present but none of the listed locales can be set.
    """
    # Read the configuration file
    config = ConfigParser()
    if config_file is not None:
        config.read(config_file)
    # Raised explicitly rather than with an assert statement so the check is
    # not stripped when Python runs with -O; the exception type is unchanged.
    if not config.has_section("Planet"):
        raise AssertionError("Configuration missing [Planet] section.")

    # Read the [Planet] config section
    planet_name = config_get(config, "Planet", "name",        PLANET_NAME)
    planet_link = config_get(config, "Planet", "link",        PLANET_LINK)
    planet_feed = config_get(config, "Planet", "feed",        PLANET_FEED)
    owner_name  = config_get(config, "Planet", "owner_name",  OWNER_NAME)
    owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL)

    if verbose:
        log_level = "DEBUG"
    else:
        log_level  = config_get(config, "Planet", "log_level", LOG_LEVEL)
    feed_timeout   = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT)
    # split() with no separator collapses runs of whitespace (including the
    # newlines of a continued config value), so no empty template names are
    # produced the way split(" ") did.
    template_files = config_get(config, "Planet", "template_files",
                                TEMPLATE_FILES).split()

    # Default feed to the first feed for which there is a template
    if not planet_feed:
        for template_file in template_files:
            name = os.path.splitext(os.path.basename(template_file))[0]
            if 'atom' in name or 'rss' in name:
                planet_feed = urlparse.urljoin(planet_link, name)
                break

    # Define locale
    if config.has_option("Planet", "locale"):
        # The user can specify more than one locale (separated by ":") as
        # fallbacks.
        locale_ok = False
        for user_locale in config.get("Planet", "locale").split(':'):
            user_locale = user_locale.strip()
            try:
                locale.setlocale(locale.LC_ALL, user_locale)
            except locale.Error:
                # Try the next fallback locale
                pass
            else:
                locale_ok = True
                break
        if not locale_ok:
            sys.stderr.write("Unsupported locale setting.\n")
            sys.exit(1)

    # Activate logging
    planet.logging.basicConfig()
    planet.logging.getLogger().setLevel(planet.logging.getLevelName(log_level))
    log = planet.logging.getLogger("planet.runner")
    # Logger objects that only provide warn() get a warning() alias
    if not hasattr(log, "warning"):
        log.warning = log.warn

    # timeoutsocket allows feedparser to time out rather than hang forever on
    # ultra-slow servers.  Python 2.3 now has this functionality available in
    # the standard socket library, so under 2.3 you don't need to install
    # anything.  But you probably should anyway, because the socket module is
    # buggy and timeoutsocket is better.
    if feed_timeout:
        try:
            feed_timeout = float(feed_timeout)
        except (TypeError, ValueError):
            log.warning("Feed timeout set to invalid value '%s', skipping", feed_timeout)
            feed_timeout = None

    if feed_timeout and not offline:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(feed_timeout)
            log.debug("Socket timeout set to %d seconds", feed_timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                log.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(feed_timeout)
                log.debug("Socket timeout set to %d seconds", feed_timeout)
            else:
                log.error("Unable to set timeout to %d seconds", feed_timeout)

    # run the planet
    my_planet = planet.Planet(config)
    my_planet.run(planet_name, planet_link, template_files, offline)

    my_planet.generate_all_files(template_files, planet_name,
        planet_link, planet_feed, owner_name, owner_email)