def get_entry_title(entry):
    if 'title' in entry:
        return html.strip_html(entry.title)
    return 'Untitled'
def get_entry_title(entry):
    if 'title' in entry:
        return truncate(html.strip_html(entry.title), MAX_TITLE_LENGTH)
    return 'Untitled'
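The truncate helper and MAX_TITLE_LENGTH are used above but never shown. A minimal sketch, assuming truncate simply clips over-long titles; the limit value and the ellipsis handling are guesses, not the project's actual code:

MAX_TITLE_LENGTH = 255  # hypothetical limit, the real value is not shown here

def truncate(value, max_length):
    # Clip the string to max_length characters, marking the cut with an ellipsis
    if len(value) <= max_length:
        return value
    return value[:max_length - 1] + u'\u2026'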
def fetch_feed(feed, add_entries=False):

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = ProblematicFeedError.code
            log.warn("%s has too many errors, disabled" % netloc)

        feed.save()

    log.debug("fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()
    request_headers = {'User-Agent': user_agent}

    interval = config.getint('fetcher', 'min_interval')

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            log.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    # Conditional GET headers
    if feed.etag and feed.last_updated_on:
        request_headers['If-None-Match'] = feed.etag
        request_headers['If-Modified-Since'] = format_http_datetime(feed.last_updated_on)

    timeout = config.getint('fetcher', 'timeout')

    try:
        response = requests.get(feed.self_link, timeout=timeout, headers=request_headers)
    except (IOError, RequestException):
        # Interpret as 'Service Unavailable'
        post_fetch(503, error=True)
        log.warn("a network error occurred while fetching %s, skipped" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            log.info("%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            log.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
            post_fetch(DuplicatedFeedError.code)
            return

    if response.status_code == 304:  # Not modified
        log.debug("%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        log.warn("%s is gone, disabled" % netloc)
        feed.is_enabled = False
        post_fetch(response.status_code)
        return
    elif response.status_code not in (200, ):  # No good
        log.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        log.info("%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code, error=False)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    post_fetch(response.status_code)

    if not add_entries:
        return

    for entry in soup.entries:

        link = get_entry_link(entry)
        guid = get_entry_id(entry, default=link)

        if not guid:
            log.warn('could not find guid for entry from %s, skipped' % netloc)
            continue

        title = get_entry_title(entry)
        timestamp = get_entry_timestamp(entry, default=now)
        author = get_entry_author(entry, soup.feed)

        # Skip ancient feed items
        max_history = config.getint('fetcher', 'max_history')
        if max_history and ((now - timestamp).days > max_history):
            log.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            log.debug("duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        mime_type, content = get_entry_content(entry)

        if blacklist and ('html' in mime_type):
            content = html.scrub_html(content, blacklist)

        d = {
            'guid': guid,
            'feed': feed,
            'title': title,
            'author': author,
            'content': content,
            'link': link,
            'last_updated_on': timestamp,
        }

        # Save to database
        Entry.create(**d)

        log.debug(u"added entry %s from %s" % (guid, netloc))
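The freshness check and the conditional GET above rely on two small time helpers, datetime_as_epoch and format_http_datetime, whose bodies are not shown. A sketch of what they presumably do; the names come from the code above, the implementations are assumptions:

import calendar

def datetime_as_epoch(value):
    # Turn a naive UTC datetime into integer seconds since the Unix epoch,
    # so the freshness delta can be compared against min_interval directly
    return int(calendar.timegm(value.utctimetuple()))

def format_http_datetime(value):
    # Format a UTC datetime as an RFC 1123 date for the If-Modified-Since header,
    # e.g. "Sun, 06 Nov 1994 08:49:37 GMT" (assumes an English locale)
    return value.strftime('%a, %d %b %Y %H:%M:%S GMT')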
def fetch_feed(feed, add_entries=False):

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = status  # Save status code for posterity
            log.warn("%s has too many errors, disabled" % netloc)

        feed.save()

    if hasattr(feed, 'subscriptions') and not feed.subscriptions:
        log.debug("feed %s has no subscribers, skipped" % feed.self_link)
        return

    log.debug("fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()
    request_headers = {'User-Agent': user_agent}

    interval = config.getint('fetcher', 'min_interval')

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            log.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    # Conditional GET headers
    if feed.etag and feed.last_updated_on:
        request_headers['If-None-Match'] = feed.etag
        request_headers['If-Modified-Since'] = format_http_datetime(feed.last_updated_on)

    timeout = config.getint('fetcher', 'timeout')

    try:
        response = requests.get(feed.self_link, timeout=timeout, headers=request_headers)
    except (IOError, RequestException):
        # Interpret as 'Service Unavailable'
        #@@FIXME: catch ContentDecodingError?
        post_fetch(503, error=True)
        log.warn("a network error occurred while fetching %s, skipped" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            log.info("%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            log.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
            post_fetch(DuplicatedFeedError.code)
            return

    if response.status_code == 304:  # Not modified
        log.debug("%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        log.warn("%s is gone, disabled" % netloc)
        feed.is_enabled = False
        post_fetch(response.status_code)
        return
    elif response.status_code not in (200, ):  # No good
        log.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        log.info("%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code, error=False)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    post_fetch(response.status_code)

    if not add_entries:
        return

    for entry in soup.entries:

        link = get_entry_link(entry)
        guid = get_entry_id(entry, default=link)

        if not guid:
            log.warn('could not find guid for entry from %s, skipped' % netloc)
            continue

        title = get_entry_title(entry)
        timestamp = get_entry_timestamp(entry, default=now)
        author = get_entry_author(entry, soup.feed)

        # Skip ancient feed items
        max_history = config.getint('fetcher', 'max_history')
        if max_history and ((now - timestamp).days > max_history):
            log.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            log.debug("duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        mime_type, content = get_entry_content(entry)

        if blacklist and ('html' in mime_type):
            content = html.scrub_html(content, blacklist)

        d = {
            'guid': guid,
            'feed': feed,
            'title': title,
            'author': author,
            'content': content,
            'link': link,
            'last_updated_on': timestamp,
        }

        # Save to database
        Entry.create(**d)

        log.debug(u"added entry %s from %s" % (guid, netloc))
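For context, fetch_feed() is presumably driven by a scheduler that walks every enabled feed. A sketch assuming a peewee-style Feed model; only Feed.get() and Feed.DoesNotExist appear above, so the select() query below is a guess:

def fetch_all_feeds(add_entries=True):
    # Fetch every feed that has not been disabled by the error-threshold logic
    for feed in Feed.select().where(Feed.is_enabled == True):
        fetch_feed(feed, add_entries=add_entries)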
def fetch_feed(feed, add_entries=False):

    def synthesize_entry(reason):
        title, content = u'This feed has been disabled', render_template(os.path.join(template_dir, '_entry_feed_disabled.html'), {'reason': reason})
        return add_synthesized_entry(feed, title, content)

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = status  # Save status code for posterity
            logger.warn("%s has too many errors, disabled" % netloc)
            synthesize_entry('Feed has accumulated too many errors (last was %s).' % status_title(status))

        feed.save()

    logger.debug("fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()
    interval = config.getint('fetcher', 'min_interval')

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            logger.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    response = fetch_url(feed.self_link, etag=feed.etag, modified_since=feed.last_updated_on)
    if not response:
        # Record as "503 Service unavailable"
        post_fetch(503, error=True)
        logger.warn("a network error occurred while fetching %s" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            logger.info("%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            logger.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
            synthesize_entry('Feed has a duplicated web address.')
            post_fetch(DuplicatedFeedError.code)
            return

    if response.status_code == 304:  # Not modified
        logger.debug("%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        feed.is_enabled = False
        logger.warn("%s is gone, disabled" % netloc)
        synthesize_entry('Feed has been removed from the origin server.')
        post_fetch(response.status_code)
        return
    elif response.status_code not in POSITIVE_STATUS_CODES:  # No good
        logger.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        logger.info("%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    post_fetch(response.status_code)

    if not add_entries:
        return

    for parsed_entry in soup.entries:

        link = get_entry_link(parsed_entry)
        guid = get_entry_id(parsed_entry, default=link)

        if not guid:
            logger.warn('could not find guid for entry from %s, skipped' % netloc)
            continue

        title = get_entry_title(parsed_entry)
        mime_type, content = get_entry_content(parsed_entry)
        timestamp = get_entry_timestamp(parsed_entry, default=now)
        author = get_entry_author(parsed_entry, soup.feed)

        # Skip ancient feed items
        max_history = config.getint('fetcher', 'max_history')
        if max_history and ((now - timestamp).days > max_history):
            logger.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            logger.debug("duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        entry = Entry(
            guid=guid,
            feed=feed,
            title=title,
            author=author,
            content=content,  #@@TODO: add mime_type too
            link=link,
            last_updated_on=timestamp
        )

        trigger_event('entry_parsed', entry, parsed_entry)

        entry.save()

        logger.debug(u"added entry %s from %s" % (guid, netloc))
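This revision replaces the inline requests.get() call with a fetch_url() wrapper whose body is not shown. A sketch reconstructed from the inline logic of the earlier versions; the signature matches the calls above and it reuses the format_http_datetime helper sketched earlier, while the User-Agent placeholder and the exact error handling are assumptions:

import requests
from requests.exceptions import RequestException

def fetch_url(url, timeout=10, etag=None, modified_since=None):
    # Placeholder UA string; the real code uses a module-level user_agent value
    request_headers = {'User-Agent': 'FeedFetcher/1.0'}
    # Conditional GET headers, as in the earlier inline version
    if etag and modified_since:
        request_headers['If-None-Match'] = etag
        request_headers['If-Modified-Since'] = format_http_datetime(modified_since)
    try:
        return requests.get(url, timeout=timeout, headers=request_headers)
    except (IOError, RequestException):
        # Callers treat a falsy result as a network error ("503 Service unavailable")
        return None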
def get_entry_title(entry, default):
    if 'title' in entry:
        return truncate(html.strip_html(entry.title), MAX_TITLE_LENGTH)
    return default
def fetch_feed(feed, add_entries=False):

    def synthesize_entry(reason):
        title, content = u'This feed has been disabled', render_template(os.path.join(template_dir, '_entry_feed_disabled.html'), {'reason': reason})
        return add_synthesized_entry(feed, title, 'text/html', content)

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = status  # Save status code for posterity
            logger.warn(u"%s has too many errors, disabled" % netloc)
            synthesize_entry('Feed has accumulated too many errors (last was %s).' % status_title(status))

        feed.save()

    max_history = config.getint('fetcher', 'max_history')
    interval = config.getint('fetcher', 'min_interval')
    timeout = config.getint('fetcher', 'timeout')

    logger.debug(u"fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            logger.debug(u"%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    response = fetch_url(feed.self_link, timeout=timeout, etag=feed.etag, modified_since=feed.last_updated_on)
    if not response:
        # Record as "503 Service unavailable"
        post_fetch(503, error=True)
        logger.warn(u"a network error occurred while fetching %s" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            logger.info(u"%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            logger.warn(u"new %s location %s is duplicated, disabled" % (netloc, self_link))
            synthesize_entry('Feed has a duplicated web address.')
            post_fetch(DuplicatedFeedError.code, error=True)
            return

    if response.status_code == 304:  # Not modified
        logger.debug(u"%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        feed.is_enabled = False
        logger.warn(u"%s is gone, disabled" % netloc)
        synthesize_entry('Feed has been removed from the origin server.')
        post_fetch(response.status_code, error=True)
        return
    elif response.status_code not in POSITIVE_STATUS_CODES:  # No good
        logger.warn(u"%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        logger.info(u"%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    if not feed.icon or not feed.icon_last_updated_on or (now - feed.icon_last_updated_on).days > FETCH_ICONS_DELTA:
        # Prefer alternate_link if available since self_link could
        # point to Feed Burner or similar services
        feed.icon = favicon.fetch(feed.alternate_link or feed.self_link)
        feed.icon_last_updated_on = now
        logger.debug(u"saved favicon %s..." % (feed.icon[:70]))

    post_fetch(response.status_code)

    if not add_entries:
        return

    for parsed_entry in soup.entries:

        link = get_entry_link(parsed_entry)
        guid = get_entry_id(parsed_entry, default=link)

        if not guid:
            logger.warn(u'could not find guid for entry from %s, skipped' % netloc)
            continue

        author = get_entry_author(parsed_entry, soup.feed)
        title = get_entry_title(parsed_entry, default='Untitled')
        content_type, content = get_entry_content(parsed_entry, default=('text/plain', ''))
        timestamp = get_entry_timestamp(parsed_entry, default=now)

        # Skip ancient feed items
        if max_history and ((now - timestamp).days > max_history):
            logger.debug(u"entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            logger.debug(u"duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        entry = Entry(
            guid=guid,
            feed=feed,
            title=title,
            author=author,
            content=content,
            content_type=content_type,
            link=link,
            last_updated_on=timestamp
        )

        trigger_event('entry_parsed', entry, parsed_entry)

        entry.save()

        logger.debug(u"added entry %s from %s" % (guid, netloc))
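add_synthesized_entry(), which synthesize_entry() uses to tell the reader why a feed was disabled, is also not shown. A rough sketch of what it might do, based only on the call add_synthesized_entry(feed, title, 'text/html', content) above and the Entry fields used in this revision; the guid scheme and placeholder field values are guesses:

from datetime import datetime
from uuid import uuid4

def add_synthesized_entry(feed, title, content_type, content):
    # Attach a self-generated entry to the feed so the reason for disabling it
    # shows up in the reading list like any other item
    entry = Entry(
        guid=uuid4().urn,          # synthetic, unique id (assumed scheme)
        feed=feed,
        title=title,
        author='',                 # no real author for a synthesized entry
        content=content,
        content_type=content_type,
        link='',                   # no meaningful permalink either
        last_updated_on=datetime.utcnow()
    )
    entry.save()
    return entry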