def httpThread(thread_index, input_queue, output_queue, log):
    # Fetch worker: pulls (uri, feed_info) pairs off input_queue until it
    # receives a falsy uri, and hands (uri, file-like feed) to output_queue.
    # Relies on module-level imports (config, feedparser, StringIO, md5, socket).
    import httplib2
    from httplib import BadStatusLine

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)

        # default result: an empty body with a 500 status
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))

        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']
            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                     feed_info.feed['planet_content_hash'] == \
                     resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                      uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            log.error("socket.gaierror: %s - %s (thread %d)",
                      uri, str(e[1]), thread_index)

        # hand the result to the consumer and pull the next job; a falsy uri
        # (sentinel) ends the worker loop
        output_queue.put(block=True, item=(uri, feed))
        uri, feed_info = input_queue.get(block=True)
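# --- usage sketch (not part of the original module) ---
# A minimal illustration of how httpThread workers could be driven, assuming
# the queue protocol above: each worker consumes (uri, feed_info) pairs and
# stops on a falsy uri. The thread count and logger name are hypothetical.
import logging
import threading
from Queue import Queue

def run_fetchers(jobs, thread_count=4):
    log = logging.getLogger('planet.spider')
    fetch_queue, parse_queue = Queue(), Queue()
    workers = [threading.Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
               for i in range(thread_count)]
    for worker in workers:
        worker.start()
    for job in jobs:                    # job is a (uri, feed_info) pair
        fetch_queue.put(job)
    for _ in workers:                   # one sentinel per worker
        fetch_queue.put((None, None))
    for worker in workers:
        worker.join()
    return parse_queue                  # holds (uri, file-like feed) results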
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1

                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1

                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()

    posted_urls = set()
    if config.post_to_twitter():
        if os.path.exists(posted_urls_file):
            try:
                with open(posted_urls_file, 'rb') as f:
                    posted_urls = pickle.load(f)
            except Exception as ex:
                log.error("Error reading posted_urls %s", ex)
    # print(posted_urls)

    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1

                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

            # Twitter integration
            if config.post_to_twitter():
                url = None
                twitter = None
                title = "Untitled post..."

                links = entry.getElementsByTagName('link')
                if links:
                    for link in links:
                        if link.hasAttribute('rel') and \
                           link.hasAttribute('type') and \
                           link.hasAttribute('href'):
                            if (link.getAttribute('rel') == 'alternate' and
                                    link.getAttribute('type') == 'text/html'):
                                url = link.getAttribute('href')
                                break

                titles = entry.getElementsByTagName('title')
                if titles:
                    title = unicode(
                        titles[0].firstChild.nodeValue.encode('utf-8'),
                        'utf-8').strip()

                handles = entry.getElementsByTagName('planet:twitter')
                if handles:
                    twitter = unicode(
                        handles[0].firstChild.nodeValue.encode('utf-8'),
                        "utf-8")

                if url is not None and url not in posted_urls:
                    # log.debug("Going to post URL to Twitter: twitter='{}' title='{}', url='{}'".format(twitter, title, url))
                    txt_append = u''
                    if twitter:
                        txt_append = u" (by @" + twitter.encode('utf-8').strip() + u")"
                    max_title_len = 280 - 20 - len(txt_append)
                    if len(title) > max_title_len:
                        title = title[:max_title_len]
                    txt = title + txt_append + u"\n" + url
                    log.debug(u"Text to post '{}'".format(txt))
                    try:
                        posted_urls.add(url)
                        config.twitter_api.update_status(txt)
                    except Exception as ex:
                        log.error(u"Error posting to Twitter: %s", ex)

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except Exception as ex:
            log.error("Error parsing %s: %s", file, ex)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)

    if config.post_to_twitter():
        with open(posted_urls_file, 'wb') as f:
            pickle.dump(posted_urls, f, protocol=pickle.HIGHEST_PROTOCOL)

    if index:
        index.close()

    return doc
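# --- configuration sketch (not part of the original module) ---
# The Twitter variant above only assumes that config.post_to_twitter() is
# truthy and that config.twitter_api exposes an update_status(text) method.
# One way to satisfy that, assuming the tweepy library and hypothetical
# credential settings, would be:
import tweepy

def make_twitter_api(consumer_key, consumer_secret, access_token, access_secret):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    return tweepy.API(auth)   # tweepy.API.update_status posts a status update

# e.g. config.twitter_api = make_twitter_api(...credentials from config...)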
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.getLogger(config.log_level(), config.log_format())

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([config.items_per_page(templ)
                     for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
            xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
                    ids = sources[0].getElementsByTagName('planet:id')
                    if not ids:
                        continue
                    if ids[0].childNodes[0].nodeValue not in sub_ids:
                        continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except:
            log.error("Error parsing %s", file)

    if index:
        index.close()

    return doc
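# --- usage sketch (not part of the original module) ---
# splice() returns an xml.dom.minidom Document; a caller would typically
# serialize it before templating or writing it out. The output path below
# is hypothetical.
if __name__ == '__main__':
    doc = splice()
    output = open('output/atom.xml', 'wb')
    output.write(doc.toxml('utf-8'))
    output.close()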