def httpThread(thread_index, input_queue, output_queue, log):
    import socket
    import httplib2
    from httplib import BadStatusLine
    from hashlib import md5

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)

        # default to a 500 response; replaced below on a successful fetch
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))

        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']
            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection: treat a body identical to the cached one
            # as a 304, even when the server did not say so
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                     feed_info.feed['planet_content_hash'] == \
                     resp['-content-hash']:
                    resp.status = 304

            # build a file-like object for the parser
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                      uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            log.error("socket.gaierror: %s - %s (thread %d)",
                      uri, str(e[1]), thread_index)

        # hand the result to the parser and pull the next work item;
        # a (None, None) sentinel ends the loop
        output_queue.put(block=True, item=(uri, feed_info, feed))
        uri, feed_info = input_queue.get(block=True)
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet """
    log = planet.logger
    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue
    for thread in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:
                if not hasattr(feed, 'headers') or \
                        int(feed.headers.status) < 300:
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)
                        # forward the cached Last-Modified timestamp to
                        # feedparser when it parses
                        try:
                            options['modified'] = time.strptime(
                                feed_info.feed.get(
                                    'planet_http_last_modified', None))
                        except:
                            pass
                    data = feedparser.parse(feed, **options)
                else:
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'href': feed.url,
                        'bozo': 0,
                        'status': int(feed.headers.status)
                    })

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id:
                    id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'):
                    href = data.href

                duplicate = None
                if id and id in feeds_seen:
                    duplicate = id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                             (uri, feeds_seen[duplicate]))
                    if href:
                        feed_info.feed['planet_http_location'] = href

                if id:
                    feeds_seen[id] = uri
                if href:
                    feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)
            except Exception, e:
                import sys, traceback
                type, value, tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(type, value) +
                             traceback.format_tb(tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        for index in threads.keys():
            if not threads[index].isAlive():
                del threads[index]
                if not threads:
                    log.info("Finished threaded part of processing.")
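# Helper assumed by spiderPlanet above but not shown in this section; a
# minimal sketch based on how it is used there: only http/https URIs are
# handed to the threaded HTTP fetchers, everything else (e.g. local files)
# goes straight to the parse queue.
def _is_http_uri(uri):
    import urlparse
    parsed = urlparse.urlparse(uri)
    return parsed[0] in ('http', 'https')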
def httpThread(thread_index, input_queue, output_queue, log):
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from hashlib import md5

    cached_session = CacheControl(
        requests.session(), cache=FileCache(config.http_cache_directory()))
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)

        # default to a 500 response; replaced below on a successful fetch
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))

        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']
            headers['User-Agent'] = 'venus'

            # issue request
            c_req = cached_session.get(idna, headers=headers, verify=True,
                                       timeout=30)
            content = c_req.content
            # adapt the requests response to the httplib2-style object the
            # rest of this function expects
            resp = fakeResponse(c_req)

            # unchanged detection: treat a body identical to the cached one
            # as a 304, even when the server did not say so
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                     feed_info.feed['planet_content_hash'] == \
                     resp['-content-hash']:
                    resp.status = 304

            # build a file-like object for the parser
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except requests.HTTPError, e:
            log.error("HTTP error when requesting %s: %s", uri, e)
        except requests.ConnectionError, e:
            log.error("Connection Error when requesting %s", uri)

        # hand the result to the parser and pull the next work item;
        # a (None, None) sentinel ends the loop
        output_queue.put(block=True, item=(uri, feed_info, feed))
        uri, feed_info = input_queue.get(block=True)
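# The fakeResponse helper used above is assumed to be defined elsewhere in
# this module. What follows is only a minimal sketch, reconstructed from how
# the object is used in httpThread (item access and has_key for headers,
# plus .status and .fromcache): it adapts a requests.Response to the
# httplib2-style response the rest of the loop expects.
class fakeResponse(dict):
    def __init__(self, response):
        # normalise header names to lower case, matching httplib2
        for key, value in response.headers.items():
            self[key.lower()] = value
        self.status = response.status_code
        # CacheControl marks responses it served locally with .from_cache;
        # default to False if the attribute is absent
        self.fromcache = getattr(response, 'from_cache', False)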
def httpThread(thread_index, input_queue, output_queue, log):
    import socket
    import httplib2
    from httplib import BadStatusLine
    from hashlib import md5

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)

        # default to a 500 response; replaced below on a successful fetch
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))

        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']
            headers['user-agent'] = ('Mozilla/5.0 (X11; Fedora; Linux x86_64; '
                                     'rv:85.0) Gecko/20100101 Firefox/85.0')

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection: treat a body identical to the cached one
            # as a 304, even when the server did not say so
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                     feed_info.feed['planet_content_hash'] == \
                     resp['-content-hash']:
                    resp.status = 304

            # build a file-like object for the parser
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                      uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.error, e:
            if e.__class__.__name__.lower() == 'timeout':
                feed.headers['status'] = '408'
                log.warn("Timeout in thread-%d", thread_index)
            else:
                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)

        # hand the result to the parser and pull the next work item;
        # a (None, None) sentinel ends the loop
        output_queue.put(block=True, item=(uri, feed_info, feed))
        uri, feed_info = input_queue.get(block=True)
def httpThread(thread_index, input_queue, output_queue, log):
    import httplib2, md5
    from socket import gaierror, error
    from httplib import BadStatusLine

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)

        # default to a 500 response; replaced below on a successful fetch
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))

        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection: treat a body identical to the cached one
            # as a 304, even when the server did not say so
            resp['-content-hash'] = md5.new(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                     feed_info.feed['planet_content_hash'] == \
                     resp['-content-hash']:
                    resp.status = 304

            # build a file-like object for the parser
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except gaierror:
            log.error("Fail to resolve server name %s via %d",
                      uri, thread_index)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                      uri, thread_index)
        except error, e:
            if e.__class__.__name__.lower() == 'timeout':
                feed.headers['status'] = '408'
                log.warn("Timeout in thread-%d", thread_index)
            else:
                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
        except Exception, e:
            import sys, traceback
            type, value, tb = sys.exc_info()
            log.error('Error processing %s', uri)
            for line in (traceback.format_exception_only(type, value) +
                         traceback.format_tb(tb)):
                log.error(line.rstrip())
            # fetch the next work item before continuing; otherwise the
            # same URI would be retried forever
            uri, feed_info = input_queue.get(block=True)
            continue

        # hand the result to the parser and pull the next work item;
        # a (None, None) sentinel ends the loop
        output_queue.put(block=True, item=(uri, feed_info, feed))
        uri, feed_info = input_queue.get(block=True)
def httpThread(thread_index, input_queue, output_queue, log):
    import socket
    import httplib2
    from httplib import BadStatusLine
    from hashlib import md5

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)

        # default to a 500 response; replaced below on a successful fetch
        feed = StringIO("")
        setattr(feed, "url", uri)
        setattr(feed, "headers", feedparser.FeedParserDict({"status": "500"}))

        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode("idna")
                else:
                    idna = uri.decode("utf-8").encode("idna")
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key("planet_http_etag"):
                headers["If-None-Match"] = feed_info.feed["planet_http_etag"]
            if feed_info.feed.has_key("planet_http_last_modified"):
                headers["If-Modified-Since"] = feed_info.feed[
                    "planet_http_last_modified"
                ]

            # issue request
            (resp, content) = h.request(idna, "GET", headers=headers)

            # unchanged detection: treat a body identical to the cached one
            # as a 304, even when the server did not say so
            resp["-content-hash"] = md5(content or "").hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif (
                    feed_info.feed.has_key("planet_content_hash")
                    and feed_info.feed["planet_content_hash"]
                    == resp["-content-hash"]
                ):
                    resp.status = 304

            # build a file-like object for the parser
            feed = StringIO(content)
            setattr(feed, "url", resp.get("content-location", uri))
            if resp.has_key("content-encoding"):
                del resp["content-encoding"]
            setattr(feed, "headers", resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.error, e:
            if e.__class__.__name__.lower() == "timeout":
                feed.headers["status"] = "408"
                log.warn("Timeout in thread-%d", thread_index)
            else:
                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)

        # hand the result to the parser and pull the next work item;
        # a (None, None) sentinel ends the loop
        output_queue.put(block=True, item=(uri, feed_info, feed))
        uri, feed_info = input_queue.get(block=True)