def posts(self, categories=None):
    """
    Yield (post, category) tuples for every post whose feed belongs to
    one of the given categories, tallying per-category counts as it goes.

    A feed-id to category lookup table is built first so the Posts
    collection can be scanned in a single pass without dereferencing the
    related feed objects — faster, and it gives us the category for free.

    Raises ExportError unless the exporter is in the Started state, since
    iterating more than once would double count posts.
    """
    # Refuse to run outside the started state to protect the counters.
    if self.state != State.Started:
        raise ExportError(
            "Calling the posts method when not in the started state "
            "could cause double counting or multiple database reads."
        )

    # Lookup table mapping feed id to its category.
    category_for = {}
    for feed in self.feeds(categories):
        category_for[feed.id] = feed.category

    # Single uncached pass over matching posts, skipping feed dereference.
    query = Post.objects(feed__in=category_for.keys())
    for post in query.no_dereference().no_cache():
        category = category_for[post.feed.id]
        self.counts[category] += 1
        yield post, category
def posts(self, categories=None):
    """
    Generate (post, category) pairs for all posts in the given categories,
    counting the number of posts per category along the way.

    Feeds are first mapped to their categories so that the post query can
    run once, without dereferencing the related feed objects, and each
    post's category can be looked up locally.

    Raises ExportError if called when the exporter is not Started, because
    re-iteration would double count or re-read the database.
    """
    if self.state != State.Started:
        message = (
            "Calling the posts method when not in the started state "
            "could cause double counting or multiple database reads."
        )
        raise ExportError(message)

    # Map each feed id onto its category for O(1) lookups below.
    feed_categories = dict(
        (feed.id, feed.category) for feed in self.feeds(categories)
    )

    # One pass over the posts: no dereferencing, no query cache.
    queryset = Post.objects(feed__in=feed_categories.keys())
    queryset = queryset.no_dereference().no_cache()
    for post in queryset:
        category = feed_categories[post.feed.id]
        self.counts[category] += 1
        yield post, category
def posts(self, categories=None):
    """
    Yield every post belonging to each feed in the given categories.
    """
    # One query per feed; flatten the per-feed querysets into one stream.
    feed_queries = (Post.objects(feed=feed) for feed in self.feeds(categories))
    for query in feed_queries:
        for post in query:
            yield post
def wrangle(self, save=True):
    """
    Converts the raw entry to standard data. If save, saves to database.

    Methodology of wrangling is as follows:

        - all fields are kept in the entry except `published` and
          `published_parsed` since these may not contain TZ data
        - instead these two fields are replaced by `pubdate`. If there is
          no publication date, `pubdate` is set to None.
        - the tags field, if it exists, is converted to a list of strings.
          Although this may cause some data loss; it will make tagging of
          all posts simpler for the application.
        - link will be renamed url
        - content will be populated with summary, if content does not
          exist in the feed. Supposedly feedparser was already doing this,
          but it appears to not be regular.
        - removes the id field so a Mongo generated ObjectID is stored.

    See the models.Post for more information on the data structure.

    NOTE: This method is destructive, the raw entry will be converted.
    """
    ## Don't rewrangle an already wrangled post
    if self.is_wrangled():
        return self.post

    ## Saves typing self.post everywhere
    post = self.post.copy()

    ## Remove unwanted fields
    for field in FEEDPARSER_REMOVABLE_FIELDS:
        if field in post:
            del post[field]

    ## Handle the pubdate and published strings (pop only runs when the
    ## key exists, since the condition is evaluated first).
    post['pubdate'] = dtparser.parse(post.pop('published')) if 'published' in post else None

    ## Handle the tags in the entry, flattening them to a list of terms
    post['tags'] = [tag['term'] for tag in self.post.tags] if 'tags' in post else []

    ## Rename the link field to url, falling back to href then entry id
    post['url'] = self.post.link or post.get('href', None) or self.post.id
    if 'link' in post:
        del post['link']

    ## Handle the content
    if 'content' not in post:
        post['content'] = post.get('summary')
    else:
        ## Select the first content item, preferring the last text/html one
        selected = None
        for idx, item in enumerate(post['content']):
            if idx == 0:
                # Take the first item
                selected = item
            elif item['type'] == 'text/html':
                # Unless we find another item that is html
                selected = item

        if selected is None:
            ## BUGFIX: an empty content list used to raise AttributeError
            ## on selected.get; fall back to the summary exactly as a
            ## missing content key does.
            post['content'] = post.get('summary')
        else:
            # Update the post with the content info
            post['language'] = selected.get('language')
            post['mimetype'] = selected.get('type')
            post['content'] = selected.get('value')

    ## Create the post object
    ## Start using self.post here!
    self.post = Post(feed=self.feed, **post)
    if save:
        self.post.save()

    return self.post
class PostWrangler(object):
    """
    As FeedSync wraps Feed to do work, so too does PostWrangler wrap an
    entry to create a Post object, to ensure that data is of a high
    quality, and to do extra things like fetch the full webpage from the
    URL provided.

    This object directly converts its input (a dict) to a models.Post object.
    """

    @classmethod
    def factory(klass, entries, feed=None):
        """
        Yields a post wrangler for each entry in the entries.

        Entries are deep copied because wrangling is destructive; the
        caller's dictionaries are never mutated.
        """
        for entry in entries:
            yield klass(deepcopy(entry), feed=feed)

    def __init__(self, entry, feed=None):
        """
        Entry is expected to be the dictionary object from a FeedSync.
        After wrangling, it will become a models.Post object.
        """
        self.feed = feed   # optional Feed reference attached to the Post
        self.post = entry  # raw entry dict until wrangle() replaces it

    def is_wrangled(self):
        """
        Checks the class of the post to see if wrangling has occurred.
        """
        return isinstance(self.post, Post)

    @reraise(klass=WranglingError)
    def wrangle(self, save=True):
        """
        Converts the raw entry to standard data. If save, saves to database.

        Methodology of wrangling is as follows:

            - all fields are kept in the entry except `published` and
              `published_parsed` since these may not contain TZ data
            - instead these two fields are replaced by `pubdate`. If there
              is no publication date, `pubdate` is set to None.
            - the tags field, if it exists, is converted to a list of
              strings. Although this may cause some data loss; it will make
              tagging of all posts simpler for the application.
            - link will be renamed url
            - content will be populated with summary, if content does not
              exist in the feed. Supposedly feedparser was already doing
              this, but it appears to not be regular.
            - removes the id field so a Mongo generated ObjectID is stored.

        See the models.Post for more information on the data structure.

        NOTE: This method is destructive, the raw entry will be converted.
        """
        ## Don't rewrangle an already wrangled post
        if self.is_wrangled():
            return self.post

        ## Saves typing self.post everywhere
        post = self.post.copy()

        ## Remove unwanted fields
        for field in FEEDPARSER_REMOVABLE_FIELDS:
            if field in post:
                del post[field]

        ## Handle the pubdate and published strings (the conditional runs
        ## first, so pop only fires when the key is present)
        post['pubdate'] = dtparser.parse(post.pop('published')) if 'published' in post else None

        ## Handle the tags in the entry
        post['tags'] = [tag['term'] for tag in self.post.tags] if 'tags' in post else []

        ## Rename the link field to url (fall back to href, then entry id)
        post['url'] = self.post.link or post.get('href', None) or self.post.id
        if 'link' in post:
            del post['link']

        ## Handle the content
        if 'content' not in post:
            post['content'] = post.get('summary')
        else:
            ## Pick the first content item, preferring the last text/html one.
            ## NOTE(review): if 'content' is an empty list, selected stays
            ## None and selected.get below raises AttributeError — confirm
            ## feeds never produce an empty content list.
            selected = None
            for idx, item in enumerate(post['content']):
                if idx == 0:
                    # Take the first item
                    selected = item
                elif item['type'] == 'text/html':
                    # Unless we find another item that is html
                    selected = item

            # Update the post with the content info
            post['language'] = selected.get('language')
            post['mimetype'] = selected.get('type')
            post['content'] = selected.get('value')

        ## Create the post object
        ## Start using self.post here!
        self.post = Post(feed=self.feed, **post)
        if save:
            self.post.save()

        return self.post

    @reraise(klass=FetchError)
    def fetch(self, save=True):
        """
        Fetches the entire webpage for the post. If save, adds the page to
        the content of the post and saves it back to the database.

        Raises an exception if not wrangled yet.
        Raises exceptions if there is a problem with the fetch.
        """
        if not self.is_wrangled():
            raise ValueError("Entry not yet wrangled, cannot fetch.")

        # Bounded request so a dead server cannot hang the ingest loop.
        response = requests.get(self.post.url, timeout=settings.timeout)
        response.raise_for_status()

        # Only replace the content when the response actually has a body.
        if response.text:
            self.post.content = response.text

        if save:
            self.post.save()

        return self.post
class PostWrangler(object):
    """
    As FeedSync wraps Feed to do work, so too does PostWrangler wrap an
    entry to create a Post object, to ensure that data is of a high
    quality, and to do extra things like fetch the full webpage from the
    URL provided.

    This object directly converts its input (a dict) to a models.Post object.
    """

    @classmethod
    def factory(klass, entries, feed=None):
        """
        Yields a post wrangler for each entry in the entries.

        Entries are deep copied because wrangling is destructive; the
        caller's dictionaries are never mutated.
        """
        for entry in entries:
            yield klass(deepcopy(entry), feed=feed)

    def __init__(self, entry, feed=None):
        """
        Entry is expected to be the dictionary object from a FeedSync.
        After wrangling, it will become a models.Post object.
        """
        self.feed = feed   # optional Feed reference attached to the Post
        self.post = entry  # raw entry dict until wrangle() replaces it

    def is_wrangled(self):
        """
        Checks the class of the post to see if wrangling has occurred.
        """
        return isinstance(self.post, Post)

    @reraise(klass=WranglingError)
    def wrangle(self, save=True):
        """
        Converts the raw entry to standard data. If save, saves to database.

        Methodology of wrangling is as follows:

            - all fields are kept in the entry except `published` and
              `published_parsed` since these may not contain TZ data
            - instead these two fields are replaced by `pubdate`. If there
              is no publication date, `pubdate` is set to None.
            - the tags field, if it exists, is converted to a list of
              strings. Although this may cause some data loss; it will make
              tagging of all posts simpler for the application.
            - link will be renamed url
            - content will be populated with summary, if content does not
              exist in the feed. Supposedly feedparser was already doing
              this, but it appears to not be regular.
            - removes the id field so a Mongo generated ObjectID is stored.

        See the models.Post for more information on the data structure.

        NOTE: This method is destructive, the raw entry will be converted.
        """
        ## Don't rewrangle an already wrangled post
        if self.is_wrangled():
            return self.post

        ## Saves typing self.post everywhere
        post = self.post.copy()

        ## Remove unwanted fields
        for field in FEEDPARSER_REMOVABLE_FIELDS:
            if field in post:
                del post[field]

        ## Handle the pubdate and published strings (pop only runs when
        ## the key is present, since the condition is evaluated first)
        post['pubdate'] = dtparser.parse(post.pop('published')) if 'published' in post else None

        ## Handle the tags in the entry
        post['tags'] = [tag['term'] for tag in self.post.tags] if 'tags' in post else []

        ## Rename the link field to url (fall back to href, then entry id)
        post['url'] = self.post.link or post.get('href', None) or self.post.id
        if 'link' in post:
            del post['link']

        ## Handle the content
        if 'content' not in post:
            post['content'] = post.get('summary')
        else:
            ## Select the first content item, preferring the last html one
            selected = None
            for idx, item in enumerate(post['content']):
                if idx == 0:
                    # Take the first item
                    selected = item
                elif item['type'] == 'text/html':
                    # Unless we find another item that is html
                    selected = item

            if selected is None:
                ## BUGFIX: an empty content list used to raise
                ## AttributeError on selected.get; fall back to the
                ## summary exactly as a missing content key does.
                post['content'] = post.get('summary')
            else:
                # Update the post with the content info
                post['language'] = selected.get('language')
                post['mimetype'] = selected.get('type')
                post['content'] = selected.get('value')

        ## Create the post object
        ## Start using self.post here!
        self.post = Post(feed=self.feed, **post)
        if save:
            self.post.save()

        return self.post

    @reraise(klass=FetchError)
    def fetch(self, save=True):
        """
        Fetches the entire webpage for the post. If save, adds the page to
        the content of the post and saves it back to the database.

        Raises an exception if not wrangled yet.
        Raises exceptions if there is a problem with the fetch.
        """
        if not self.is_wrangled():
            raise ValueError("Entry not yet wrangled, cannot fetch.")

        ## BUGFIX: requests.get without a timeout can block forever on a
        ## dead server; use the configured timeout to match the sibling
        ## PostWrangler.fetch implementation in this codebase.
        response = requests.get(self.post.url, timeout=settings.timeout)
        response.raise_for_status()

        # Only replace the content when the response actually has a body.
        if response.text:
            self.post.content = response.text

        if save:
            self.post.save()

        return self.post