def test_parsing_valid_feeds(self, feed):
    note(feed.feed)
    note(feed.items)
    with patch(
        "urllib.request.OpenerDirector.open",
        new=partial(self.patch_open, feed),
    ):
        contents, errors = feedergrabber(feed.feed["link"])
        if contents is None:
            note(errors)
            self.assertEqual(0, len(feed.items))
            self.assertEqual(1, len(errors))
            self.assertIn("Parsing methods not successful", errors[0])
        else:
            for i, (link, title, date, content) in enumerate(contents):
                item = feed.items[i]
                self.assertEqual(link, item["link"])
                item_date = item.get("pubdate", item.get("updateddate"))
                note(item_date)
                note(date)
                self.assertIsNotNone(date)
                self.assertGreaterEqual(
                    datetime.datetime.now().utctimetuple(),
                    date.utctimetuple(),
                )
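# The patched-in helper is not shown here. A minimal sketch of what a
# `patch_open` method on the test case could look like, assuming `feed` is a
# feed-generator object exposing `writeString` and that the replacement is
# called in place of OpenerDirector.open, so it receives the requested URL
# and must return a file-like response. Names and behaviour are assumptions,
# not the project's actual implementation:
import io


def patch_open(self, feed, url, *args, **kwargs):
    # Serve the generated feed's XML from memory instead of the network.
    return io.BytesIO(feed.writeString("utf-8").encode("utf-8"))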
def test_parsing_feeds_with_min_dates(self):
    with patch("urllib2.OpenerDirector.open", new=self.min_date_feed):
        contents, errors = feedergrabber("http://max.computer/index.html")
        self.assertIsNone(contents)
        self.assertEqual(2, len(errors))
        self.assertIn("Parsing methods not successful", errors[-1])
        self.assertIn("hugo page", errors[0])
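# For reference, the contract these tests exercise (inferred from the
# assertions above, not from separate documentation): feedergrabber returns a
# (contents, errors) pair, where contents is None when nothing could be
# parsed and errors is a list of human-readable messages (plain strings in
# the newer tests, nested lists in the older ones). A caller might use it
# roughly like this:
contents, errors = feedergrabber("http://example.com/atom.xml")
if contents is None:
    for message in errors:
        print(message)
else:
    for link, title, date, content in contents:
        print(link, title, date)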
def crawlblog(self, blog):
    # Feedergrabber returns ( [(link, title, date, content)], [errors])
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if not crawled:
        log.debug('\n'.join(errors))
        return

    log.debug('Crawled %s posts from %s', len(crawled), blog.feed_url)
    if errors:
        log.debug('\n'.join(errors))

    blog.last_crawled = timezone.now()
    blog.save(update_fields=['last_crawled'])

    created_count = 0
    for link, title, date, content in crawled:
        date = timezone.make_aware(date, timezone.get_default_timezone())

        # create the post instance if it doesn't already exist
        post, created = get_or_create_post(
            blog, title, link, date, content
        )

        if created:
            created_count += 1
            log.debug("Created '%s' from blog '%s'", title, blog.feed_url)
            # Throttle the amount of new posts that can be announced per
            # user per crawl.
            if created_count <= settings.MAX_POST_ANNOUNCE:
                self.zulip_queue.append(post)
        else:
            update_post(post, title, link, content)
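# get_or_create_post and update_post are helpers imported from elsewhere in
# the project. A minimal sketch of what they might do, assuming a Django Post
# model with blog, url, title, date_updated and content fields (the field
# names are assumptions, not taken from the actual models):
def get_or_create_post(blog, title, link, date, content):
    return Post.objects.get_or_create(
        blog=blog,
        url=link,
        defaults={"title": title, "date_updated": date, "content": content},
    )


def update_post(post, title, link, content):
    # Only hit the database when something actually changed.
    if (post.title, post.url, post.content) != (title, link, content):
        post.title = title
        post.url = link
        post.content = content
        post.save(update_fields=["title", "url", "content"])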
def crawlblog(self, blog):
    # Feedergrabber returns ( [(link, title, date, content)], [errors])
    print(f"Crawling {blog.feed_url} ...")
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if not crawled:
        log.debug("\n".join(errors))
        return

    log.debug("Crawled %s posts from %s", len(crawled), blog.feed_url)
    if errors:
        log.debug("\n".join(errors))

    blog.last_crawled = timezone.now()
    blog.save(update_fields=["last_crawled"])

    created_count = 0
    for link, title, date, content in crawled:
        date = timezone.make_aware(date, timezone.get_default_timezone())

        # create the post instance if it doesn't already exist
        post, created = get_or_create_post(blog, title, link, date, content)

        if created:
            created_count += 1
            log.debug("Created '%s' from blog '%s'", title, blog.feed_url)
            # Throttle the amount of new posts that can be announced per
            # user per crawl.
            if created_count <= settings.MAX_POST_ANNOUNCE:
                self.zulip_queue.append(post)
        else:
            update_post(post, title, link, content)
def crawlblog(self, blog):
    # Feedergrabber returns ( [(link, title, date)], [errors])
    # We're ignoring the errors returned for right now
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if crawled:
        for link, title, date in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            now = timezone.make_aware(datetime.datetime.now(),
                                      timezone.get_default_timezone())
            title = cleantitle(title)

            # create the post instance if it doesn't already exist
            post, created = Post.objects.get_or_create(
                blog = blog,
                url = link,
                defaults = {
                    'title': title,
                    'date_updated': date,
                }
            )

            if created:
                print "Created '%s' from blog '%s'" % (title, blog.feed_url)
                # Only post to zulip if the post was created recently
                # so that new accounts don't spam zulip with their entire post list
                if (now - date) < max_zulip_age:
                    post_page = ROOT_URL + 'post/' + Post.objects.get(url=link).slug
                    send_message_zulip(user=blog.user, link=post_page, title=title)
                # subscribe the author to comment updates
                # subscription, created = Comment_Subscription.objects.get_or_create(
                #     user = blog.user,
                #     post = post,
                # )

            # if new info, update the posts
            if not created:
                # print ".",
                updated = False
                if date != post.date_updated:
                    post.date_updated = date
                    updated = True
                if title != post.title:
                    post.title = title
                    updated = True
                if updated:
                    print "Updated %s in %s." % (title, blog.feed_url)
                    post.save()
    else:
        log.debug(str(errors))
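# max_zulip_age is defined elsewhere in the module. Judging by the older
# humbug-era versions below, which hard-code a 2-day window, it is presumably
# something like the following (the exact value is an assumption):
import datetime

max_zulip_age = datetime.timedelta(days=2)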
def crawlblog(self, blog):
    print "** CRAWLING", blog.feed_url
    # Feedergrabber returns ( [(link, title, date)], [errors])
    # We're ignoring the errors returned for right now
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if crawled:
        for link, title, date in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            now = timezone.make_aware(datetime.datetime.now(),
                                      timezone.get_default_timezone())
            title = cleantitle(title)

            # create the post instance if it doesn't already exist
            post, created = Post.objects.get_or_create(blog=blog, url=link, defaults={
                'title': title,
                'date_updated': date,
            })

            if created:
                print "Created", title
                # Only post to humbug if the post was created in the last 2 days
                # so that new accounts don't spam humbug with their entire post list
                if (now - date) < datetime.timedelta(days=2):
                    send_message_humbug(user=blog.user, link=link, title=title)

            # if new info, update the posts
            if not created:
                print "Retrieved", title
                updated = False
                if date != post.date_updated:
                    post.date_updated = date
                    updated = True
                if title != post.title:
                    post.title = title
                    updated = True
                if updated:
                    print "Updated", title
                    post.save()
    else:
        log.debug(str(errors))
def crawlblog(self, blog):
    # Feedergrabber returns ( [(link, title, date)], [errors])
    # We're ignoring the errors returned for right now
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if crawled:
        for link, title, date in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            now = timezone.make_aware(datetime.datetime.now(),
                                      timezone.get_default_timezone())
            title = cleantitle(title)

            # create the post instance if it doesn't already exist
            post, created = Post.objects.get_or_create(blog=blog, url=link, defaults={
                'title': title,
                'date_updated': date,
            })

            if created:
                print "Created '%s' from blog '%s'" % (title, blog.feed_url)
                # Only post to zulip if the post was created recently
                # so that new accounts don't spam zulip with their entire post list
                if (now - date) < max_zulip_age:
                    post_page = ROOT_URL + 'post/' + Post.objects.get(
                        url=link).slug
                    self.enqueue_zulip(self.zulip_queue, blog.user, post_page,
                                       title, blog.stream)

            # if new info, update the posts
            if not created:
                updated = False
                if date != post.date_updated:
                    post.date_updated = date
                    updated = True
                if title != post.title:
                    post.title = title
                    updated = True
                if updated:
                    print "Updated %s in %s." % (title, blog.feed_url)
                    post.save()
    else:
        log.debug(str(errors))
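# enqueue_zulip is defined on the command class and is not shown here. A
# minimal sketch consistent with the call above, assuming the queue simply
# buffers announcement tuples to be flushed after the crawl (the tuple layout
# is an assumption):
def enqueue_zulip(self, queue, user, link, title, stream):
    # Defer the actual Zulip API call; announcements are sent in one batch
    # once every blog has been crawled.
    queue.append((user, link, title, stream))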
def test_parsing_broken_feeds(self, feed):
    note(feed.feed)
    note(feed.items)
    with patch('urllib2.OpenerDirector.open',
               new=partial(self.patch_open_broken_feed, feed)):
        contents, errors = feedergrabber(feed.feed['link'])
        note(contents)
        note(errors)
        self.assertIsNone(contents)
        self.assertEqual(len(feed.items) + 1, len(errors))
        self.assertIn('Parsing methods not successful', errors[-1][0])
def test_parsing_broken_feeds(self, feed):
    note(feed.feed)
    note(feed.items)
    with patch(
        "urllib.request.OpenerDirector.open",
        new=partial(self.patch_open_broken_feed, feed),
    ):
        contents, errors = feedergrabber(feed.feed["link"])
        note(contents)
        note(errors)
        self.assertIsNone(contents)
        self.assertEqual(len(feed.items) + 1, len(errors))
        self.assertIn("Parsing methods not successful", errors[-1])
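# patch_open_broken_feed is the broken-feed counterpart of patch_open. The
# assertion above expects one error per feed item plus a final "Parsing
# methods not successful" message, which suggests the stub serves a feed
# whose entries cannot be fully parsed, for example with dates stripped out.
# A rough sketch under that assumption (not the project's actual stub):
import io
import re


def patch_open_broken_feed(self, feed, url, *args, **kwargs):
    xml = feed.writeString("utf-8")
    # Drop the per-entry dates so that every item triggers a parse error.
    xml = re.sub(r"<(pubDate|updated)>.*?</\1>", "", xml)
    return io.BytesIO(xml.encode("utf-8"))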
def crawlblog(self, blog):
    print "\n** CRAWLING", blog.feed_url
    # Feedergrabber returns ( [(link, title, date)], [errors])
    # We're ignoring the errors returned for right now
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if crawled:
        for link, title, date in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            now = timezone.make_aware(datetime.datetime.now(),
                                      timezone.get_default_timezone())
            title = cleantitle(title)

            # create the post instance if it doesn't already exist
            post, created = Post.objects.get_or_create(
                blog = blog,
                url = link,
                defaults = {
                    'title': title,
                    'date_updated': date,
                }
            )

            if created:
                print "Created", title
                # Only post to humbug if the post was created in the last 2 days
                # so that new accounts don't spam humbug with their entire post list
                if (now - date) < datetime.timedelta(days=2):
                    post_page = ROOT_URL + 'post/' + Post.objects.get(url=link).slug
                    send_message_humbug(user=blog.user, link=post_page, title=title)

            # if new info, update the posts
            if not created:
                print ".",
                updated = False
                if date != post.date_updated:
                    post.date_updated = date
                    updated = True
                if title != post.title:
                    post.title = title
                    updated = True
                if updated:
                    print "Updated", title
                    post.save()
    else:
        log.debug(str(errors))
def crawlblog(self, blog):
    # Feedergrabber returns ( [(link, title, date)], [errors])
    # We're ignoring the errors returned for right now
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if crawled:
        post_count = 0
        for link, title, date in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            now = timezone.make_aware(datetime.datetime.now(),
                                      timezone.get_default_timezone())
            title = cleantitle(title)

            # create the post instance if it doesn't already exist
            post, created = Post.objects.get_or_create(
                blog = blog,
                url = link,
                defaults = {
                    'title': title,
                    'date_updated': date,
                }
            )

            if created:
                print "Created '%s' from blog '%s'" % (title, blog.feed_url)
                # Throttle the amount of new posts that can be announced per user per crawl.
                if post_count < MAX_POST_ANNOUNCE:
                    post_page = ROOT_URL + 'post/' + Post.objects.get(url=link).slug
                    self.enqueue_zulip(self.zulip_queue, blog.user, post_page,
                                       title, blog.stream)
                post_count += 1

            # if new info, update the posts
            if not created:
                updated = False
                if date != post.date_updated:
                    post.date_updated = date
                    updated = True
                if title != post.title:
                    post.title = title
                    updated = True
                if updated:
                    print "Updated %s in %s." % (title, blog.feed_url)
                    post.save()
    else:
        log.debug(str(errors))
def handle_noargs(self, **options):
    for blog in Blog.objects.all():
        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, _ = feedergrabber27.feedergrabber(blog.feed_url)

        if crawled:
            for link, title, date in crawled:
                date = timezone.make_aware(date, timezone.get_default_timezone())

                # create the post instance if it doesn't already exist
                post, created = Post.objects.get_or_create(
                    blog=blog, url=link,
                    defaults={"title": title, "date_updated": date}
                )

                if created:
                    print "Created", title
                    send_message_hb(user=blog.user, link=link, title=title)

                # if new info, update the posts
                if not created:
                    print "Retrieved", title
                    updated = False
                    if date != post.date_updated:
                        post.date_updated = date
                        updated = True
                    if title != post.title:
                        post.title = title
                        updated = True
                    if updated:
                        print "Updated", title
                        post.save()

    if options["dry_run"]:
        transaction.rollback()
        print "\nDON'T FORGET TO RUN THIS FOR REAL\n"
    else:
        transaction.commit()
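# handle_noargs belongs to an old-style Django NoArgsCommand. The dry_run
# option and the manual transaction.commit()/rollback() calls above imply
# wiring roughly like the following sketch for pre-1.8 Django, assuming the
# command runs under manual transaction management; this is not the project's
# actual command class:
from optparse import make_option

from django.core.management.base import NoArgsCommand


class Command(NoArgsCommand):
    help = "Crawl all registered blog feeds and create/update posts."
    option_list = NoArgsCommand.option_list + (
        make_option(
            "--dry-run",
            action="store_true",
            dest="dry_run",
            default=False,
            help="Crawl feeds but roll back all database writes.",
        ),
    )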
def crawlblog(self, blog):
    # Feedergrabber returns ( [(link, title, date, content)], [errors])
    # We're ignoring the errors returned for right now
    crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

    if not crawled:
        log.debug(str(errors))
        return

    log.debug('Crawled %s posts from %s', len(crawled), blog.feed_url)

    created_count = 0
    for link, title, date, content in crawled:
        date = timezone.make_aware(date, timezone.get_default_timezone())
        title = cleantitle(title)

        # create the post instance if it doesn't already exist
        post, created = get_or_create_post(blog, title, link, date, content)

        if created:
            created_count += 1
            log.debug("Created '%s' from blog '%s'", title, blog.feed_url)
            # Throttle the amount of new posts that can be announced per
            # user per crawl.
            if created_count <= settings.MAX_POST_ANNOUNCE:
                post_page = ROOT_URL + 'post/' + post.slug
                self.enqueue_zulip(blog.user, post_page, title, blog.stream)

        # if title or content changes, update the post
        elif title != post.title or content != post.content:
            post.title = title
            post.content = content
            print "Updated %s in %s." % (title, blog.feed_url)
            post.save()

        else:
            # Any other updates are ignored, as of now
            pass
def test_parsing_valid_feeds(self, feed):
    note(feed.feed)
    note(feed.items)
    with patch('urllib2.OpenerDirector.open',
               new=partial(self.patch_open, feed)):
        contents, errors = feedergrabber(feed.feed['link'])
        if contents is None:
            note(errors)
            self.assertEqual(0, len(feed.items))
            self.assertEqual(1, len(errors))
            self.assertIn('Parsing methods not successful', errors[0][0])
        else:
            for i, (link, title, date, content) in enumerate(contents):
                item = feed.items[i]
                self.assertEqual(link, item['link'])
                item_date = item.get('pubdate', item.get('updateddate'))
                note(item_date)
                note(date)
                self.assertIsNotNone(date)
                self.assertGreaterEqual(
                    datetime.datetime.now().utctimetuple(),
                    date.utctimetuple()
                )