Example #1
0
def httpThread(thread_index, input_queue, output_queue, log):
    """ Worker loop: fetch queued feed URIs over HTTP using httplib2

    NOTE(review): this listing appears to stop inside the ``while`` loop;
    nothing shown here writes to output_queue or reads the next work item,
    so the comments describe only the statements that are visible.

    thread_index -- number identifying this worker in log messages
    input_queue  -- queue of (uri, feed_info) pairs; a false uri ends the loop
    output_queue -- not referenced in the visible portion
    log          -- logger receiving progress and error messages
    """
    import httplib2
    from httplib import BadStatusLine

    # one Http object per worker, given the configured HTTP cache directory
    h = httplib2.Http(config.http_cache_directory())
    # block until the first work item arrives
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        # placeholder result: an empty body flagged with status 500.  It is
        # what `feed` still holds if one of the handlers below catches an
        # exception before the real response has been wrapped.
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', 
            feedparser.FeedParserDict({'status':'500'}))
        try:
            # map IRI => URI
            # unicode is encoded with the 'idna' codec directly; byte strings
            # are first decoded as UTF-8
            try:
                if isinstance(uri,unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                # any failure while mapping falls back to the original uri
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            # validators remembered in the cached feed info turn the GET
            # into a conditional request
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            # identify the client, embedding the configured planet link
            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            # a 200 that was served from the cache, or whose body hash equals
            # the stored planet_content_hash, is rewritten to 304
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            # url is the content-location header when present, else the uri
            feed = StringIO(content) 
            setattr(feed, 'url', resp.get('content-location', uri))
            # content-encoding is removed from the headers passed along with
            # the body -- presumably because httplib2 has already decoded
            # the content; TODO confirm
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            # e[1] is the second argument of the exception (presumably the
            # resolver's message text)
            log.error("socket.gaierror: %s - %s (thread %d)", uri, str(e[1]), thread_index)
Example #2
0
def httpThread(thread_index, input_queue, output_queue, log):
    """ Worker loop: fetch queued feed URIs over HTTP using httplib2

    NOTE(review): this listing appears to stop inside the ``while`` loop;
    nothing shown here writes to output_queue or reads the next work item,
    so the comments describe only the statements that are visible.

    thread_index -- number identifying this worker in log messages
    input_queue  -- queue of (uri, feed_info) pairs; a false uri ends the loop
    output_queue -- not referenced in the visible portion
    log          -- logger receiving progress and error messages
    """
    import httplib2
    from httplib import BadStatusLine

    # one Http object per worker, given the configured HTTP cache directory
    h = httplib2.Http(config.http_cache_directory())
    # block until the first work item arrives
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        # placeholder result: an empty body flagged with status 500.  It is
        # what `feed` still holds if one of the handlers below catches an
        # exception before the real response has been wrapped.
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))
        try:
            # map IRI => URI
            # unicode is encoded with the 'idna' codec directly; byte strings
            # are first decoded as UTF-8
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                # any failure while mapping falls back to the original uri
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            # validators remembered in the cached feed info turn the GET
            # into a conditional request
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            # identify the client, embedding the configured planet link
            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            # a 200 that was served from the cache, or whose body hash equals
            # the stored planet_content_hash, is rewritten to 304
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            # url is the content-location header when present, else the uri
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            # content-encoding is removed from the headers passed along with
            # the body -- presumably because httplib2 has already decoded
            # the content; TODO confirm
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri,
                      thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            # e[1] is the second argument of the exception (presumably the
            # resolver's message text)
            log.error("socket.gaierror: %s - %s (thread %d)", uri, str(e[1]),
                      thread_index)
Example #3
0
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet

    Every URI from config.subscriptions() is queued: URIs accepted by
    _is_http_uri() go to the httpThread workers when any were started,
    everything else goes straight to the parse queue.  Each result is then
    parsed, compared with the feeds already seen during this run and handed
    to writeCache().

    only_if_new -- when true, subscriptions that already have cached feed
                   information are skipped
    """
    log = planet.logger

    # reset the module-level index flag for this run
    global index
    index = True

    # apply the configured socket timeout, falling back to the legacy
    # timeoutsocket module when the socket module call fails
    timeout = config.feed_timeout()
    try:
        seconds = float(timeout)
        socket.setdefaulttimeout(seconds)
        # log the converted number: the raw config value may be a string,
        # which "%d" cannot format
        log.info("Socket timeout set to %d seconds", seconds)
    except Exception:
        try:
            import timeoutsocket
            seconds = float(timeout)
            timeoutsocket.setDefaultSocketTimeout(seconds)
            log.info("Socket timeout set to %d seconds", seconds)
        except Exception:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    thread_count = int(config.spider_threads())
    if thread_count:
        # Start all the worker threads
        for i in range(thread_count):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            # no fetch step: the uri itself stands in for the fetched feed
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue (one sentinel per worker)
    for _ in threads:
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:
                if not hasattr(feed, 'headers') or \
                        int(feed.headers.status) < 300:
                    # nothing fetched yet, or a successful fetch: parse it
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        # NOTE: the previous code also ran time.strptime()
                        # on planet_http_last_modified, but the result was
                        # never used, so only the etag is passed on.
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)

                    data = feedparser.parse(feed, **options)
                else:
                    # status >= 300: synthesize an empty result that only
                    # carries the response headers, URL and status
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'href': feed.url,
                        'bozo': 0,
                        'status': int(feed.headers.status),
                    })

                # duplicate feed?
                feed_id = data.feed.get('id', None)
                if not feed_id:
                    feed_id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'):
                    href = data.href

                duplicate = None
                if feed_id and feed_id in feeds_seen:
                    duplicate = feed_id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warning('Duplicate subscription: %s and %s',
                                uri, feeds_seen[duplicate])
                    if href:
                        feed_info.feed['planet_http_location'] = href

                # remember which subscription produced this id / location
                if feed_id:
                    feeds_seen[feed_id] = uri
                if href:
                    feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception:
                # one broken feed must not stop the run: log the traceback
                # and carry on with the next result
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(exc_type,
                                                             exc_value) +
                             traceback.format_tb(exc_tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        # Reap finished workers.  The loop variable must not be named
        # `index`: because of the `global index` declaration above, that
        # would rebind the module-level flag to a thread number.
        for thread_id in list(threads.keys()):
            if not threads[thread_id].isAlive():
                del threads[thread_id]
                if not threads:
                    log.info("Finished threaded part of processing.")
Example #4
0
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet

    Every URI from config.subscriptions() is queued: URIs accepted by
    _is_http_uri() go to the httpThread workers when any were started,
    everything else goes straight to the parse queue.  Each result is then
    parsed, compared with the feeds already seen during this run and handed
    to writeCache().

    only_if_new -- when true, subscriptions that already have cached feed
                   information are skipped
    """
    log = planet.logger

    # reset the module-level index flag for this run
    global index
    index = True

    # apply the configured socket timeout, falling back to the legacy
    # timeoutsocket module when the socket module call fails
    timeout = config.feed_timeout()
    try:
        seconds = float(timeout)
        socket.setdefaulttimeout(seconds)
        # log the converted number: the raw config value may be a string,
        # which "%d" cannot format
        log.info("Socket timeout set to %d seconds", seconds)
    except Exception:
        try:
            import timeoutsocket
            seconds = float(timeout)
            timeoutsocket.setDefaultSocketTimeout(seconds)
            log.info("Socket timeout set to %d seconds", seconds)
        except Exception:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    thread_count = int(config.spider_threads())
    if thread_count:
        # Start all the worker threads
        for i in range(thread_count):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            # no fetch step: the uri itself stands in for the fetched feed
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue (one sentinel per worker)
    for _ in threads:
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:
                if not hasattr(feed, 'headers') or \
                        int(feed.headers.status) < 300:
                    # nothing fetched yet, or a successful fetch: parse it
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        # NOTE: the previous code also ran time.strptime()
                        # on planet_http_last_modified, but the result was
                        # never used, so only the etag is passed on.
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)

                    data = feedparser.parse(feed, **options)
                else:
                    # status >= 300: synthesize an empty result that only
                    # carries the response headers, URL and status
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'href': feed.url,
                        'bozo': 0,
                        'status': int(feed.headers.status),
                    })

                # duplicate feed?
                feed_id = data.feed.get('id', None)
                if not feed_id:
                    feed_id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'):
                    href = data.href

                duplicate = None
                if feed_id and feed_id in feeds_seen:
                    duplicate = feed_id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warning('Duplicate subscription: %s and %s',
                                uri, feeds_seen[duplicate])
                    if href:
                        feed_info.feed['planet_http_location'] = href

                # remember which subscription produced this id / location
                if feed_id:
                    feeds_seen[feed_id] = uri
                if href:
                    feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception:
                # one broken feed must not stop the run: log the traceback
                # and carry on with the next result
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(exc_type,
                                                             exc_value) +
                             traceback.format_tb(exc_tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        # Reap finished workers.  The loop variable must not be named
        # `index`: because of the `global index` declaration above, that
        # would rebind the module-level flag to a thread number.
        for thread_id in list(threads.keys()):
            if not threads[thread_id].isAlive():
                del threads[thread_id]
                if not threads:
                    log.info("Finished threaded part of processing.")
Example #5
0
def httpThread(thread_index, input_queue, output_queue, log):
    """ Worker loop: fetch queued feed URIs with requests + CacheControl

    NOTE(review): this listing appears to stop inside the ``while`` loop;
    nothing shown here writes to output_queue or reads the next work item,
    so the comments describe only the statements that are visible.

    thread_index -- number identifying this worker in log messages
    input_queue  -- queue of (uri, feed_info) pairs; a false uri ends the loop
    output_queue -- not referenced in the visible portion
    log          -- logger receiving progress and error messages
    """

    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache

    # one requests session per worker, wrapped by CacheControl with a file
    # cache located in the configured HTTP cache directory
    cached_session = CacheControl(requests.session(),
                                  cache=FileCache(
                                      config.http_cache_directory()))

    # block until the first work item arrives
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        # placeholder result: an empty body flagged with status 500.  It is
        # what `feed` still holds if one of the handlers below catches an
        # exception before the real response has been wrapped.
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))
        try:
            # map IRI => URI
            # unicode is encoded with the 'idna' codec directly; byte strings
            # are first decoded as UTF-8
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                # any failure while mapping falls back to the original uri
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            # validators remembered in the cached feed info turn the GET
            # into a conditional request
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            # fixed User-Agent; certificate verification is on and the
            # request is given a hard-coded 30 second timeout
            headers['User-Agent'] = 'venus'
            c_req = cached_session.get(idna,
                                       headers=headers,
                                       verify=True,
                                       timeout=30)
            content = c_req.content

            # fakeResponse is defined outside this listing; presumably it
            # adapts the requests response to the interface used below
            # (.status, .fromcache, dict-style access) -- TODO confirm
            resp = fakeResponse(c_req)

            # unchanged detection
            # a 200 that was served from the cache, or whose body hash equals
            # the stored planet_content_hash, is rewritten to 304
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            # url is the content-location header when present, else the uri
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            # content-encoding is removed from the headers passed along with
            # the body -- presumably because `content` is already decoded;
            # TODO confirm
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)

        except requests.HTTPError, e:
            log.error("HTTP error when requesting %s: %s", uri, e)
        except requests.ConnectionError, e:
            # the exception detail is not included in this message
            log.error("Connection Error when requesting %s", uri)
Example #6
0
def httpThread(thread_index, input_queue, output_queue, log):
    """ Worker loop: fetch queued feed URIs over HTTP using httplib2

    NOTE(review): this listing appears to stop inside the ``while`` loop;
    nothing shown here writes to output_queue or reads the next work item,
    so the comments describe only the statements that are visible.

    thread_index -- number identifying this worker in log messages
    input_queue  -- queue of (uri, feed_info) pairs; a false uri ends the loop
    output_queue -- not referenced in the visible portion
    log          -- logger receiving progress and error messages
    """
    import httplib2
    from httplib import BadStatusLine

    # one Http object per worker, given the configured HTTP cache directory
    h = httplib2.Http(config.http_cache_directory())
    # block until the first work item arrives
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        # placeholder result: an empty body flagged with status 500.  It is
        # what `feed` still holds if one of the handlers below catches an
        # exception before the real response has been wrapped.
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))
        try:
            # map IRI => URI
            # unicode is encoded with the 'idna' codec directly; byte strings
            # are first decoded as UTF-8
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                # any failure while mapping falls back to the original uri
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            # validators remembered in the cached feed info turn the GET
            # into a conditional request
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            # a hard-coded desktop Firefox User-Agent string is sent with
            # every request
            headers[
                'user-agent'] = 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            # a 200 that was served from the cache, or whose body hash equals
            # the stored planet_content_hash, is rewritten to 304
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            # url is the content-location header when present, else the uri
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            # content-encoding is removed from the headers passed along with
            # the body -- presumably because httplib2 has already decoded
            # the content; TODO confirm
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri,
                      thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.error, e:
            # timeouts are recognised by the exception's class name and
            # recorded as status 408 on whatever `feed` currently holds
            # (the placeholder, when the request itself failed)
            if e.__class__.__name__.lower() == 'timeout':
                feed.headers['status'] = '408'
                log.warn("Timeout in thread-%d", thread_index)
            else:
                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
Example #7
0
def httpThread(thread_index, input_queue, output_queue, log):
    """ Worker loop: fetch queued feed URIs over HTTP using httplib2

    NOTE(review): this listing appears to stop inside the ``while`` loop;
    nothing shown here writes to output_queue or reads the next work item,
    so the comments describe only the statements that are visible.

    thread_index -- number identifying this worker in log messages
    input_queue  -- queue of (uri, feed_info) pairs; a false uri ends the loop
    output_queue -- not referenced in the visible portion
    log          -- logger receiving progress and error messages
    """
    # md5 here is the legacy Python 2 module (used below as md5.new)
    import httplib2, md5
    from socket import gaierror, error 
    from httplib import BadStatusLine

    # one Http object per worker, given the configured HTTP cache directory
    h = httplib2.Http(config.http_cache_directory())
    # block until the first work item arrives
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        # placeholder result: an empty body flagged with status 500.  It is
        # what `feed` still holds if one of the handlers below catches an
        # exception before the real response has been wrapped.
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', 
            feedparser.FeedParserDict({'status':'500'}))
        try:
            # map IRI => URI
            # unicode is encoded with the 'idna' codec directly; byte strings
            # are first decoded as UTF-8
            try:
                if isinstance(uri,unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                # any failure while mapping falls back to the original uri
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            # validators remembered in the cached feed info turn the GET
            # into a conditional request
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            # a 200 that was served from the cache, or whose body hash equals
            # the stored planet_content_hash, is rewritten to 304
            resp['-content-hash'] = md5.new(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            # url is the content-location header when present, else the uri
            feed = StringIO(content) 
            setattr(feed, 'url', resp.get('content-location', uri))
            # content-encoding is removed from the headers passed along with
            # the body -- presumably because httplib2 has already decoded
            # the content; TODO confirm
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except gaierror:
            # name resolution failures are handled before the more general
            # socket error clause further down
            log.error("Fail to resolve server name %s via %d",
                uri, thread_index)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                uri, thread_index)
        except error, e:
            # timeouts are recognised by the exception's class name and
            # recorded as status 408 on whatever `feed` currently holds
            # (the placeholder, when the request itself failed)
            if e.__class__.__name__.lower()=='timeout':
                feed.headers['status'] = '408'
                log.warn("Timeout in thread-%d", thread_index)
            else:
                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
        except Exception, e:
            # anything else: log the exception line followed by the
            # formatted traceback, one log record per line
            import sys, traceback
            type, value, tb = sys.exc_info()
            log.error('Error processing %s', uri)
            for line in (traceback.format_exception_only(type, value) +
                traceback.format_tb(tb)):
                log.error(line.rstrip())
            # NOTE(review): in the code visible here `uri` has not changed
            # when this `continue` re-enters the loop, so the same URI is
            # attempted again -- confirm that this is intended
            continue
Example #8
0
def httpThread(thread_index, input_queue, output_queue, log):
    """ Worker loop: fetch queued feed URIs over HTTP using httplib2

    NOTE(review): this listing appears to stop inside the ``while`` loop;
    nothing shown here writes to output_queue or reads the next work item,
    so the comments describe only the statements that are visible.

    thread_index -- number identifying this worker in log messages
    input_queue  -- queue of (uri, feed_info) pairs; a false uri ends the loop
    output_queue -- not referenced in the visible portion
    log          -- logger receiving progress and error messages
    """
    import httplib2
    from httplib import BadStatusLine

    # one Http object per worker, given the configured HTTP cache directory
    h = httplib2.Http(config.http_cache_directory())
    # block until the first work item arrives
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        # placeholder result: an empty body flagged with status 500.  It is
        # what `feed` still holds if one of the handlers below catches an
        # exception before the real response has been wrapped.
        feed = StringIO("")
        setattr(feed, "url", uri)
        setattr(feed, "headers", feedparser.FeedParserDict({"status": "500"}))
        try:
            # map IRI => URI
            # unicode is encoded with the 'idna' codec directly; byte strings
            # are first decoded as UTF-8
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode("idna")
                else:
                    idna = uri.decode("utf-8").encode("idna")
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                # any failure while mapping falls back to the original uri
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            # validators remembered in the cached feed info turn the GET
            # into a conditional request
            headers = {}
            if feed_info.feed.has_key("planet_http_etag"):
                headers["If-None-Match"] = feed_info.feed["planet_http_etag"]
            if feed_info.feed.has_key("planet_http_last_modified"):
                headers["If-Modified-Since"] = feed_info.feed["planet_http_last_modified"]

            # issue request
            (resp, content) = h.request(idna, "GET", headers=headers)

            # unchanged detection
            # a 200 that was served from the cache, or whose body hash equals
            # the stored planet_content_hash, is rewritten to 304
            resp["-content-hash"] = md5(content or "").hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif (
                    feed_info.feed.has_key("planet_content_hash")
                    and feed_info.feed["planet_content_hash"] == resp["-content-hash"]
                ):
                    resp.status = 304

            # build a file-like object
            # url is the content-location header when present, else the uri
            feed = StringIO(content)
            setattr(feed, "url", resp.get("content-location", uri))
            # content-encoding is removed from the headers passed along with
            # the body -- presumably because httplib2 has already decoded
            # the content; TODO confirm
            if resp.has_key("content-encoding"):
                del resp["content-encoding"]
            setattr(feed, "headers", resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.error, e:
            # timeouts are recognised by the exception's class name and
            # recorded as status 408 on whatever `feed` currently holds
            # (the placeholder, when the request itself failed)
            if e.__class__.__name__.lower() == "timeout":
                feed.headers["status"] = "408"
                log.warn("Timeout in thread-%d", thread_index)
            else:
                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)