def test_faulty_dates2(self): entries = [ {"title": u"first", "updated": u"06/01/2010 CET", "published_parsed": None}, {"title": u"second", "updated": u"23/12/2009 CET", "published_parsed": None}, ] entries = entries_by_date(entries) d1 = date_to_datetime("published_parsed")(None, entries[0]) d2 = date_to_datetime("published_parsed")(None, entries[1]) self.assertTrue(d1 > d2) self.assertEqual(entries, entries_by_date(entries)) reversed_entries = list(entries) reversed_entries.reverse() self.assertNotEqual(entries, reversed_entries) self.assertEqual(entries, entries_by_date(reversed_entries))
def test_faulty_dates2(self): entries = [ { "title": u"first", "updated": u"06/01/2010 CET", "updated_parsed": None }, { "title": u"second", "updated": u"23/12/2009 CET", "updated_parsed": None }, ] entries = entries_by_date(entries) d1 = date_to_datetime("published_parsed")(None, entries[0]) d2 = date_to_datetime("published_parsed")(None, entries[1]) self.assertTrue(d1 > d2) self.assertEqual(entries, entries_by_date(entries)) reversed_entries = list(entries) reversed_entries.reverse() self.assertNotEqual(entries, reversed_entries) self.assertEqual(entries, entries_by_date(reversed_entries))
def test_no_date(self): x = date_to_datetime("date_test") date = x(None, {}) now = datetime.now(pytz.utc) self.assertTupleEqual((date.year, date.month, date.day), (now.year, now.month, now.day))
def test_wrong_type(self): x = date_to_datetime("date_test") date = x(None, {"date_test": object()}) now = datetime.now(pytz.utc) self.assertTupleEqual((date.year, date.month, date.day), (now.year, now.month, now.day))
class FeedImporter(object): """Import/Update feeds. :keyword post_limit: See :attr`post_limit`. :keyword update_on_import: See :attr:`update_on_import`. :keyword logger: See :attr:`logger`. :keyword include_categories: See :attr:`include_categories`. :keyword include_enclosures: See :attr:`include_enclosures`. :keyword timeout: See :attr:`timeout`. .. attribute:: post_limit Default number of posts limit. .. attribute:: update_on_import By default, fetch new posts when a feed is imported .. attribute:: logger The :class:`logging.Logger` instance used for logging messages. .. attribute:: include_categories By default, include feed/post categories. .. attribute:: include_enclosures By default, include post enclosures. .. attribute:: timeout Default feed timeout. .. attribute:: parser The feed parser used. (Default: :mod:`feedparser`.) """ parser = feedparser post_limit = conf.DEFAULT_POST_LIMIT include_categories = conf.STORE_CATEGORIES include_enclosures = conf.STORE_ENCLOSURES update_on_import = True post_model = models.Post feed_model = models.Feed category_model = models.Category enclosure_model = models.Enclosure post_field_handlers = { "content": feedutil.find_post_content, "date_published": feedutil.date_to_datetime("published_parsed"), "date_updated": feedutil.date_to_datetime("updated_parsed"), "link": lambda feed_obj, entry: entry.get("link") or feed_obj.feed_url, "feed": lambda feed_obj, entry: feed_obj, "guid": feedutil.get_entry_guid, "title": lambda feed_obj, entry: entry.get("title", "(no title)").strip(), "author": lambda feed_obj, entry: entry.get("author", "").strip(), } def __init__(self, **kwargs): self.post_limit = kwargs.get("post_limit", self.post_limit) self.update_on_import = kwargs.get("update_on_import", self.update_on_import) self.logger = kwargs.get("logger", get_default_logger()) self.include_categories = kwargs.get("include_categories", self.include_categories) self.include_enclosures = kwargs.get("include_enclosures", self.include_enclosures) self.timeout = kwargs.get("timeout", conf.FEED_TIMEOUT) self.backend = backend_or_default(kwargs.get("backend")) self.post_model = self.backend.get_post_model() def parse_feed(self, feed_url, etag=None, modified=None, timeout=None, maxlen=None): """Parse feed using the current feed parser. :param feed_url: URL to the feed to parse. :keyword etag: E-tag recevied from last parse (if any). :keyword modified: ``Last-Modified`` HTTP header received from last parse (if any). :keyword timeout: Parser timeout in seconds. """ prev_timeout = socket.getdefaulttimeout() timeout = timeout or self.timeout socket.setdefaulttimeout(timeout) try: if maxlen: headers = self.early_headers(feed_url) contentlen = int(headers.get("content-length") or 0) if contentlen > maxlen: raise exceptions.FeedCriticalError( unicode(models.FEED_GENERIC_ERROR_TEXT)) feed = self.parser.parse(feed_url, etag=etag, modified=modified) finally: socket.setdefaulttimeout(prev_timeout) return feed def early_headers(self, feed_url): class HeadRequest(urllib2.Request): def get_method(self): return "HEAD" return urllib2.urlopen(HeadRequest(feed_url)).headers def real_headers(self, feed_url): return urllib2.urlopen(urllib2.Request(feed_url)) def import_feed(self, feed_url, force=None, local=False): """Import feed. If feed is not seen before it will be created, otherwise just updated. :param feed_url: URL to the feed to import. :keyword force: Force import of feed even if it's been updated recently. """ feed_url = feed_url.strip() feed = None try: feed_obj = self.feed_model.objects.get(feed_url=feed_url) except self.feed_model.DoesNotExist: try: feed = self.parse_feed(feed_url) except socket.timeout: self.feed_model.objects.create(feed_url=feed_url, sort=0) raise exceptions.TimeoutError( unicode(models.FEED_TIMEDOUT_ERROR_TEXT)) except Exception: feed = {"status": 500} default_status = http.OK if local else http.NOT_FOUND status = feed.get("status", default_status) if status == http.NOT_FOUND: raise exceptions.FeedNotFoundError( unicode(models.FEED_NOT_FOUND_ERROR_TEXT)) if status not in models.ACCEPTED_STATUSES: raise exceptions.FeedCriticalError(unicode( models.FEED_GENERIC_ERROR_TEXT), status=status) # Feed can be local/fetched with a HTTP client. status = feed.get("status") or feed.get("status\n") or http.OK if status == http.FOUND or status == http.MOVED_PERMANENTLY: if feed_url != feed.href: return self.import_feed(feed.href, force=force) feed_name = feed.channel.get("title", "(no title)").strip() feed_data = truncate_field_data( self.feed_model, { "sort": 0, "name": feed_name, "description": feed.channel.get("description", ""), }) feed_obj = self.feed_model.objects.update_or_create( feed_url=feed_url, **feed_data) if self.include_categories: feed_obj.categories.add(*self.get_categories(feed.channel)) if self.update_on_import: feed_obj = self.update_feed(feed_obj, feed=feed, force=force) return feed_obj def get_categories(self, obj): """Get and save categories.""" return [ self.create_category(*cat) for cat in getattr(obj, "categories", []) ] def create_category(self, domain, name): """Create new category. :param domain: The category domain. :param name: The name of the category. """ return self.category_model.objects.update_or_create( name=name.strip(), domain=domain and domain.strip() or "") def update_feed(self, feed_obj, feed=None, force=False): """Update (refresh) feed. The feed must already exist in the system, if not you have to import it using :meth:`import_feed`. :param feed_obj: the Feed object :keyword feed: If feed has already been parsed you can pass the structure returned by the parser so it doesn't have to be parsed twice. :keyword force: Force refresh of the feed even if it has been recently refreshed already. """ now = datetime.utcnow().replace(tzinfo=utc) already_fresh = ( feed_obj.date_last_refresh and now < feed_obj.date_last_refresh + conf.MIN_REFRESH_INTERVAL) if already_fresh and not force: self.logger.info("Feed %s is fresh. Skipping refresh." % feed_obj.feed_url) return feed_obj limit = self.post_limit if not feed: last_modified = None if feed_obj.http_last_modified and not force: last_modified = feed_obj.http_last_modified.timetuple() etag = feed_obj.http_etag if not force else None try: feed = self.parse_feed(feed_obj.feed_url, etag=etag, modified=last_modified) except socket.timeout: return feed_obj.save_timeout_error() except Exception: return feed_obj.save_generic_error() # Feed can be local/ not fetched with HTTP client. status = feed.get("status", http.OK) if status == http.NOT_MODIFIED and not force: return feed_obj if feed_obj.is_error_status(status): return feed_obj.set_error_status(status) if feed.entries: sorted_by_date = feedutil.entries_by_date(feed.entries, limit) for entry in sorted_by_date: self.import_entry(entry, feed_obj) feed_obj.date_last_refresh = now feed_obj.http_etag = feed.get("etag", "") if hasattr(feed, "modified") and feed.modified: try: as_ts = time.mktime(feed.modified) feed_obj.http_last_modified = datetime.fromtimestamp( as_ts).replace(tzinfo=utc) except TypeError: pass self.logger.debug("uf: %s Saving feed object..." % (feed_obj.feed_url)) feed_obj.save() return feed_obj def create_enclosure(self, **kwargs): """Create new enclosure.""" kwargs.setdefault("length", 0) return self.enclosure_model.objects.update_or_create(**kwargs) def get_enclosures(self, entry): """Get and create enclosures for feed.""" return [ self.create_enclosure(url=enclosure.href, length=enclosure.length, type=enclosure.type) for enclosure in getattr(entry, "enclosures", []) if enclosure and hasattr(enclosure, "length") ] def post_fields_parsed(self, entry, feed_obj): """Parse post fields.""" return dict((key, handler(feed_obj, entry)) for key, handler in self.post_field_handlers.items()) def import_entry(self, entry, feed_obj): """Import feed post entry.""" self.logger.debug("ie: %s Importing entry..." % feed_obj.feed_url) fields = self.post_fields_parsed(entry, feed_obj) post = self.post_model.objects.update_or_create(feed_obj, **fields) if self.include_enclosures: post.enclosures.add(*(self.get_enclosures(entry) or [])) if self.include_categories: post.categories.add(*(self.get_categories(entry) or [])) self.logger.debug("ie: %s Post successfully imported..." % (feed_obj.feed_url)) return post