def get_feed(cls, feed_url, return_latest_date=False, include_later_than=None, logger=None, username=None, password=None):
    """
    Retrieve the feed entries as a list of flattened dictionaries.

    Arguments:
    feed_url -- The URL of the feed to retrieve (as a string)
    return_latest_date -- If true, also return the date of the latest entry as the second item of a tuple
    include_later_than -- Only include entries whose date is strictly later than this date (a time struct)
    logger -- Optional logger for debug/warning output; may be None
    username -- The username to use when authenticating
    password -- The password to use when authenticating
    """

    auth_handler = None

    # Get an authentication handler if needed
    if username is not None and password is not None:
        auth_handler = cls.get_auth_handler(feed_url, username, password)

    # Parse the feed; use an opener with the auth handler when credentials were supplied
    if auth_handler is not None:
        opener = urllib2.build_opener(auth_handler)
        feed = opener.open(feed_url)
        d = feedparser.parse(feed)
    else:
        d = feedparser.parse(feed_url)

    entries = []
    latest_date = None

    # Stop if we didn't get a result
    if d is None or not hasattr(d, 'entries'):
        # Bug fix: guard the logger like every other call site in this method does;
        # previously a failed parse with logger=None raised AttributeError here.
        if logger is not None:
            logger.warn("No entries returned from the feed, url=\"%s\"", feed_url)
    else:
        for entry in d.entries:

            # Get the updated or published date
            entry_date = cls.get_updated_date(entry)

            # Perform the operations that are based on the date
            if entry_date is not None:

                # If this is the latest one, then save it
                if latest_date is None or entry_date > latest_date:
                    latest_date = entry_date

                # If the item is earlier than the date we are to include, then skip it
                if include_later_than is not None and entry_date <= include_later_than:
                    if logger is not None:
                        logger.debug("Skipping entry with date=%r, since it is not later than latest_date=%r, title=\"%s\"",
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                     entry.title)
                    continue
                elif logger is not None and include_later_than is not None:
                    # Bug fix: this message previously said "since its not later than", which is
                    # the opposite of the condition — entries reach this branch because they ARE later.
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                 entry.title)
                elif logger is not None and include_later_than is None:
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 "none",
                                 entry.title)

            # Entries with no parseable date are still included
            entries.append(cls.flatten(entry))

    # Return the latest date if requested
    if return_latest_date:
        return entries, latest_date
    else:
        return entries
def get_realm_and_auth_type(cls, feed_url, username, password):
    """
    Determine the authentication realm and scheme required by the feed URL.

    Performs a request and inspects the response; returns a (realm, auth_type)
    tuple where auth_type is "Digest" or "Basic". Both items are None when no
    authentication is required or the response could not be interpreted.

    Arguments:
    feed_url -- The URL of the feed to probe
    username -- Unused here; kept for interface compatibility with callers
    password -- Unused here; kept for interface compatibility with callers
    """

    # Perform request and get the realm and whether or not the view uses HTTP digest or basic authentication
    d = feedparser.parse(feed_url)

    # Make sure we got a result
    if d is None or not hasattr(d, 'status'):
        return None, None

    # If the status isn't 401, then authentication isn't required
    if d.status != 401:
        return None, None

    # Make sure that an auth header exists
    if 'www-authenticate' not in d.headers:
        return None, None

    auth_header = d.headers['www-authenticate']

    # Get the realm and whether it is using basic or digest authentication
    http_auth_re = re.compile("((Digest)|(Basic))( realm=[\"]?([^\"]*)[\"]?)?")
    match = http_auth_re.search(auth_header)

    # Bug fix: a malformed or unrecognized WWW-Authenticate header makes search()
    # return None; previously match.groups() then raised AttributeError.
    if match is None:
        return None, None

    auth_type = match.groups()[0]
    auth_realm = match.groups()[4]

    return auth_realm, auth_type
def get_feed(cls, feed_url, return_latest_date=False, include_later_than=None, logger=None, username=None, password=None):
    """
    Retrieve the feed entries as a list of flattened dictionaries.

    Arguments:
    feed_url -- The URL of the feed to retrieve (as a string)
    return_latest_date -- If true, also return the date of the latest entry as the second item of a tuple
    include_later_than -- Only include entries whose date is strictly later than this date
    logger -- Optional logger for debug output; may be None
    username -- The username to use when authenticating
    password -- The password to use when authenticating
    """

    auth_handler = None

    # Get an authentication handler if needed
    if username is not None and password is not None:
        auth_handler = cls.get_auth_handler(feed_url, username, password)

    # Parse the feed; use an opener with the auth handler when credentials were supplied
    if auth_handler is not None:
        opener = urllib2.build_opener(auth_handler)
        feed = opener.open(feed_url)
        d = feedparser.parse(feed)
    else:
        d = feedparser.parse(feed_url)

    entries = []
    latest_date = None

    # Bug fix: previously iterated d.entries unconditionally, which raised
    # AttributeError when the parse failed outright and returned no entries.
    if d is not None and hasattr(d, 'entries'):
        for entry in d.entries:

            # Get the updated or published date
            entry_date = cls.get_updated_date(entry)

            # Perform the operations that are based on the date
            if entry_date is not None:

                # If this is the latest one, then save it
                if latest_date is None or entry_date > latest_date:
                    latest_date = entry_date

                # If the item is earlier than the date we are to include, then skip it
                if include_later_than is not None and entry_date <= include_later_than:
                    if logger is not None:
                        logger.debug("Skipping entry with date=%r, since it is not later than latest_date=%r", entry_date, include_later_than)
                    continue

            # Entries with no parseable date are still included
            entries.append(cls.flatten(entry))

    # Return the latest date if requested
    if return_latest_date:
        return entries, latest_date
    else:
        return entries
def get_realm_and_auth_type(cls, feed_url, username, password):
    """
    Determine the authentication realm and scheme required by the feed URL.

    Performs a request and inspects the response; returns a (realm, auth_type)
    tuple where auth_type is "Digest" or "Basic". Both items are None when no
    authentication is required or the response could not be interpreted.

    Arguments:
    feed_url -- The URL of the feed to probe
    username -- Unused here; kept for interface compatibility with callers
    password -- Unused here; kept for interface compatibility with callers
    """

    # Perform request and get the realm and whether or not the view uses HTTP digest or basic authentication
    d = feedparser.parse(feed_url)

    # Bug fix: make sure we got a usable result; previously d.status raised
    # AttributeError when the parse produced no HTTP status at all.
    if d is None or not hasattr(d, 'status'):
        return None, None

    # If the status isn't 401, then authentication isn't required
    if d.status != 401:
        return None, None

    # Bug fix: a 401 without a WWW-Authenticate header previously raised KeyError
    if 'www-authenticate' not in d.headers:
        return None, None

    auth_header = d.headers['www-authenticate']

    # Get the realm and whether it is using basic or digest authentication
    http_auth_re = re.compile("((Digest)|(Basic))( realm=[\"]?([^\"]*)[\"]?)?")
    match = http_auth_re.search(auth_header)

    # Bug fix: an unrecognized header makes search() return None; previously
    # match.groups() then raised AttributeError.
    if match is None:
        return None, None

    auth_type = match.groups()[0]
    auth_realm = match.groups()[4]

    return auth_realm, auth_type
def get_feed(cls, feed_url, return_latest_date=False, include_later_than=None, logger=None, username=None, password=None, clean_html=True):
    """
    Get the feed results as a dictionary.

    Arguments:
    feed_url -- The URL of the feed to retrieve (as a string)
    return_latest_date -- Return the date of the latest feed entry as the second item in a tuple
    include_later_than -- Only return the feeds that are after this date
    logger -- The logger to log the data to; may be None
    username -- The username to use when authenticating
    password -- The password to use when authenticating
    clean_html -- If true, HTML will be converted to something human readable
    """

    auth_handler = None

    # Get an authentication handler if needed
    if username is not None and password is not None:
        auth_handler = cls.get_auth_handler(feed_url, username, password)

    # Parse the feed; use an opener with the auth handler when credentials were supplied
    if auth_handler is not None:
        opener = urllib2.build_opener(auth_handler)
        feed = opener.open(feed_url)
        d = feedparser.parse(feed)
    else:
        d = feedparser.parse(feed_url)

    entries = []
    latest_date = None

    # Stop if we didn't get a result
    if d is None or not hasattr(d, 'entries'):
        # Bug fix: guard the logger like the debug call sites below do; previously
        # a failed parse with logger=None raised AttributeError here.
        if logger is not None:
            logger.warn("No entries returned from the feed, url=\"%s\"", feed_url)
    else:
        for entry in d.entries:

            # Get the updated or published date
            entry_date = cls.get_updated_date(entry)

            # Perform the operations that are based on the date
            if entry_date is not None:

                # If this is the latest one, then save it
                if latest_date is None or entry_date > latest_date:
                    latest_date = entry_date

                # If the item is earlier than the date we are to include, then skip it
                if include_later_than is not None and entry_date <= include_later_than:
                    if logger is not None:
                        logger.debug("Skipping entry with date=%r, since it is not later than latest_date=%r, title=\"%s\"",
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                     entry.title)
                    continue
                elif logger is not None and include_later_than is not None:
                    # Bug fix: this message previously said "since its not later than", which is
                    # the opposite of the condition — entries reach this branch because they ARE later.
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                 entry.title)
                elif logger is not None and include_later_than is None:
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 "none",
                                 entry.title)

            # Clean up the HTML if requested; each field is converted best-effort
            # and left untouched when conversion fails
            if clean_html:

                # Clean up the content
                try:
                    if entry.get('content', None) and entry['content'][0].get('type', 'text/html') == 'text/html' and entry['content'][0].get('value', None):
                        entry['content'][0]['value'] = html2text.html2text(entry['content'][0]['value'])
                # Bug fix: narrowed from a bare except (which also swallowed
                # SystemExit/KeyboardInterrupt) and guard the optional logger
                except Exception:
                    if logger is not None:
                        logger.warn("Unable to convert the HTML content, field=%s", "value")

                # Clean up the summary
                try:
                    if entry.get('summary', None):
                        entry['summary'] = html2text.html2text(entry['summary'])
                except Exception:
                    if logger is not None:
                        logger.warn("Unable to convert the HTML content, field=%s", "summary")

                # Clean up the summary_detail
                try:
                    if entry.get('summary_detail', None) and entry['summary_detail'].get('type', 'text/html') == 'text/html' and entry['summary_detail'].get('value', None):
                        entry['summary_detail']['value'] = html2text.html2text(entry['summary_detail']['value'])
                except Exception:
                    if logger is not None:
                        logger.warn("Unable to convert the HTML content, field=%s", "summary_detail")

            # Entries with no parseable date are still included
            entries.append(cls.flatten(entry, sort=True))

    # Return the latest date if requested
    if return_latest_date:
        return entries, latest_date
    else:
        return entries