Example #1
    def on_status(self, status):
        # Check for retweet.
        retweet = status.text.startswith("RT @")

        # Get urls.
        urls = set([])
        collect_urls(status._json, urls)

        user = status.user.screen_name.lower()
        for url in urls:
            # Check for blocked sites.
            if news.blocked(url): continue

            # Check for news site. Try to crawl all urls in tweets from feeds.
            # Otherwise the site must be in the whitelist.
            site = news.sitename(url)
            if user not in users:
                if retweet and not flags.arg.retweets: continue
                if site not in news.sites: continue

            # Crawl URL.
            print("---", user, "-", news.trim_url(url))
            crawler.crawl(url)
            sys.stdout.flush()
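This example only shows the on_status callback; to run it, the handler has to be attached to a Twitter stream. A minimal sketch of that wiring, assuming tweepy 3.x; the credential strings, the feed_ids list, and the NewsFeedListener class name are placeholders, not part of the original code.

import tweepy

# Placeholder credentials and feed list (hypothetical values).
auth = tweepy.OAuthHandler("<consumer-key>", "<consumer-secret>")
auth.set_access_token("<access-token>", "<access-token-secret>")
feed_ids = ["<twitter-user-id>"]

class NewsFeedListener(tweepy.StreamListener):
    def on_status(self, status):
        # ... handler body from Example #1 ...
        pass

    def on_error(self, status_code):
        # Returning False disconnects the stream, e.g. on rate limiting.
        return False

# Follow the configured news feeds and dispatch each tweet to on_status.
stream = tweepy.Stream(auth=auth, listener=NewsFeedListener())
stream.filter(follow=feed_ids)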
Example #2
                     check_for_updates=False)
reddit.read_only = True

# Monitor live Reddit submission stream for news articles.
crawler = news.Crawler("reddit")
while True:
    try:
        for submission in reddit.subreddit('all').stream.submissions():
            # Ignore self submissions.
            if submission.is_self: continue

            # Discard non-news sites.
            if submission.over_18: continue
            subreddit = str(submission.subreddit)
            url = submission.url
            if news.blocked(url): continue
            site = news.sitename(url)
            if subreddit not in news_reddits:
                if subreddit in ignored_reddits: continue
                if site not in news.sites: continue

            # Crawl URL.
            domain = str(submission.domain)
            title = str(submission.title)
            print("---", domain, subreddit, "-", title)
            crawler.crawl(url)
            sys.stdout.flush()

        print("restart submission stream")
        time.sleep(20)
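The first line of this example is the tail end of the praw.Reddit(...) constructor call, which the excerpt cuts off. A hedged sketch of how that client might be set up; the credential values and user agent string are placeholders, only check_for_updates=False and read_only come from the example itself.

import praw

reddit = praw.Reddit(client_id="<client-id>",          # placeholder
                     client_secret="<client-secret>",  # placeholder
                     user_agent="news crawler",        # placeholder
                     check_for_updates=False)
reddit.read_only = True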
Example #3
    num_urls = 0
    num_blocked = 0
    num_saved = 0
    num_dups = 0
    num_redirects = 0
    for uri, date, content in sling.WebArchive(warc):
        num_urls += 1
        if num_urls % 1000 == 0: print(warc, ":", num_urls, "urls", end="\r")

        # Trim URL.
        try:
            url = news.trim_url(uri.decode("utf8"))
        except Exception as e:
            print("Invalid URI:", uri, e)
            continue

        # Discard blocked sites.
        if news.blocked(url):
            num_blocked += 1
            continue

        # Discard large articles.
        if len(content) > flags.arg.max_article_size:
            print("Article too big:", url, ",", len(content), "bytes")
            continue

        # Get canonical URL.
        canonical = news.get_canonical_url(url, content)
        if canonical is None: canonical = url

        # Store web page under canonical URL.
        result = news.store(canonical, date, content)
        #print("%s %d [%s] %s" % (date, num_urls, result, canonical))
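This example processes a single WARC file; the surrounding driver is not shown. A minimal sketch of how it might be invoked, assuming SLING's sling.flags wrapper around argparse; the --warc flag, the default values, and the process_warc wrapper are illustrative, only --max_article_size appears in the example itself.

import sling
import sling.flags as flags

flags.define("--warc", help="WARC file with archived news articles", default="")
flags.define("--max_article_size", help="maximum article size in bytes",
             default=8 * 1024 * 1024, type=int)
flags.parse()

def process_warc(warc):
    # Counter setup, sling.WebArchive loop, and news.store() calls from
    # Example #3 go here.
    ...

process_warc(flags.arg.warc)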