def on_status(self, status):
  """Tweepy stream callback: crawl news links found in an incoming tweet.

  Tweets from whitelisted feed accounts (`users`) are crawled
  unconditionally; for everyone else the link must point at a known news
  site, and retweets are skipped unless --retweets is enabled.
  """
  is_retweet = status.text.startswith("RT @")

  # Collect every url embedded in the raw tweet json.
  found = set([])
  collect_urls(status._json, found)

  sender = status.user.screen_name.lower()
  for link in found:
    # Never crawl blocked sites.
    if news.blocked(link): continue

    # Feed accounts bypass the whitelist; other senders are subject to the
    # retweet policy and the news-site whitelist.
    source = news.sitename(link)
    if sender not in users:
      if is_retweet and not flags.arg.retweets: continue
      if source not in news.sites: continue

    # Fetch and store the article.
    print("---", sender, "-", news.trim_url(link))
    crawler.crawl(link)
    sys.stdout.flush()
check_for_updates=False) reddit.read_only = True # Monitor live Reddit submission stream for news articles. crawler = news.Crawler("reddit") while True: try: for submission in reddit.subreddit('all').stream.submissions(): # Ignore self submissions. if submission.is_self: continue # Discard non-news sites. if submission.over_18: continue subreddit = str(submission.subreddit) url = submission.url if news.blocked(url): continue site = news.sitename(url) if subreddit not in news_reddits: if subreddit in ignored_reddits: continue if site not in news.sites: continue # Crawl URL. domain = str(submission.domain) title = str(submission.title) print("---", domain, subreddit, "-", title) crawler.crawl(url) sys.stdout.flush() print("restart submission stream") time.sleep(20)
# Per-archive counters; not updated inside the visible loop -- presumably
# incremented/reported further below based on `result` (outside this chunk).
num_saved = 0
num_dups = 0
num_redirects = 0

# Process every archived page in the WARC file. `warc`, `num_urls` and
# `num_blocked` are defined before this chunk.
for uri, date, content in sling.WebArchive(warc):
  num_urls += 1
  # Progress indicator, rewritten in place every 1000 urls.
  if num_urls % 1000 == 0: print(warc, ":", num_urls, "urls", end="\r")

  # Trim URL; skip records whose URI is not valid utf-8 / not trimmable.
  try:
    url = news.trim_url(uri.decode("utf8"))
  except Exception as e:
    print("Invalid URI:", uri, e)
    continue

  # Discard blocked sites.
  if news.blocked(url):
    num_blocked += 1
    continue

  # Discard large articles (size limit from --max_article_size).
  if len(content) > flags.arg.max_article_size:
    print("Article too big:", url, ",", len(content), "bytes")
    continue

  # Get canonical URL; fall back to the trimmed URL when the page does not
  # declare one.
  canonical = news.get_canonical_url(url, content)
  if canonical is None: canonical = url

  # Store web page under canonical URL.
  result = news.store(canonical, date, content)
  #print("%s %d [%s] %s" % (date, num_urls, result, canonical))