Example #1
import json

import praw

import sling.flags as flags
import sling.crawl.news as news

# Load news site list.
news.init()

# Connect to Reddit.
with open(flags.arg.apikeys, "r") as f:
    apikeys = json.load(f)

reddit = praw.Reddit(client_id=apikeys["client_id"],
                     client_secret=apikeys["client_secret"],
                     user_agent=apikeys["user_agent"],
                     check_for_updates=False)
reddit.read_only = True

# Monitor live Reddit submission stream for news articles.
crawler = news.Crawler("reddit")
while True:
    try:
        for submission in reddit.subreddit('all').stream.submissions():
            # Ignore self submissions.
            if submission.is_self: continue

            # Discard non-news sites.
            if submission.over_18: continue
            subreddit = str(submission.subreddit)
            url = submission.url
            if news.blocked(url): continue
            site = news.sitename(url)
            if subreddit not in news_reddits:
                if subreddit in ignored_reddits: continue
                if site not in news.sites: continue
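
The excerpt ends inside the try: block that guards the live stream; news_reddits and ignored_reddits are assumed to be defined earlier in the full script. Below is a minimal sketch of how the loop might be completed, assuming accepted URLs are simply handed to the crawler and that stream errors are retried after a pause; the crawler.crawl call, the exception handling, and the required "import time" are assumptions, not part of the excerpt.

            # Hand the accepted URL to the crawler (assumed continuation).
            crawler.crawl(url)
    except KeyboardInterrupt:
        # Stop cleanly on Ctrl-C.
        break
    except Exception as e:
        # The live stream can drop; report the error, pause, and reconnect.
        print("Reddit stream error:", e)
        time.sleep(60)

crawler.wait()
crawler.dumpstats()
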
Example #2
except Exception as e:
  print("*** XML parse error:", e, "in parsing news feed")
  sys.exit(1)

if flags.arg.newsites:
  # Check for unknown news sites.
  news.init()
  newsites = collections.defaultdict(int)
  for item in root.iter("item"):
    child = item.find("link")
    if child is None: continue
    url = child.text
    if url == "https://newslookup.com/": continue
    site = news.sitename(url)
    if site not in news.sites: newsites[site] += 1
  for site in sorted(newsites, key=newsites.get, reverse=True):
    print(newsites[site], site)
else:
  # Fetch articles.
  crawler = news.Crawler("newslookup")
  for item in root.iter("item"):
    child = item.find("link")
    if child is None: continue
    url = child.text
    if url == "https://newslookup.com/": continue
    crawler.crawl(url)

  crawler.wait()
  crawler.dumpstats()
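
This excerpt starts after the newslookup feed has already been downloaded and parsed into root. Below is a minimal sketch of how root might be obtained, assuming the feed is fetched with requests and parsed with xml.etree.ElementTree; the feed_url placeholder and the timeout are illustrative assumptions, and the except block mirrors the one shown at the top of the excerpt.

import sys
import requests
import xml.etree.ElementTree as ET

# feed_url is a hypothetical placeholder; the real newslookup feed URL is not
# shown in the excerpt.
feed_url = "https://newslookup.com/"

r = requests.get(feed_url, timeout=60)
try:
    root = ET.fromstring(r.content)
except Exception as e:
    print("*** XML parse error:", e, "in parsing news feed")
    sys.exit(1)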

Example #3
# Map each news site's Twitter handle to a numeric user id feed.
for domain, site in news.sites.items():
    if site.twitter is not None:
        users.add(site.twitter.lower()[1:])
        if site.twitter in user_cache:
            feeds.add(user_cache[site.twitter])
        else:
            try:
                user = api.get_user(site.twitter)
                feeds.add(str(user.id))
                print(site.twitter, user.id)
            except Exception as e:
                print("Ignore bad feed for domain", domain, ":", site.twitter,
                      e)

# Initialize news crawler.
crawler = news.Crawler("twitter")


def collect_urls(obj, urls):
    if "entities" in obj:
        entities = obj["entities"]
        for url in entities["urls"]:
            expanded_url = url["expanded_url"]
            if expanded_url.startswith("https://twitter.com/"): continue
            if expanded_url.startswith("https://www.twitter.com/"): continue
            if expanded_url.startswith("https://mobile.twitter.com/"): continue
            urls.add(expanded_url)

    if "retweeted_status" in obj:
        retweet = obj["retweeted_status"]
        collect_urls(retweet, urls)
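
collect_urls walks a tweet object (a parsed JSON dict), gathers every expanded URL that does not point back to Twitter, and recurses into retweets. A minimal usage sketch follows, assuming the tweet arrives as a dict from the streaming API; the status dict is an illustrative stand-in, and filtering with news.blocked is patterned on Example #1 rather than taken from this excerpt.

# Illustrative tweet dict; in the real script it would come from the stream.
status = {
    "entities": {
        "urls": [{"expanded_url": "https://example.com/some-article"}]
    },
}

urls = set()
collect_urls(status, urls)
for url in urls:
    if not news.blocked(url):
        crawler.crawl(url)
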
Example #4
"""
Fetch news articles and put them into news archive.
"""

import requests
import sling
import sling.flags as flags
import sling.crawl.news as news

flags.define("--urls",
             help="File with urls to fetch",
             default=None,
             metavar="FILE")

flags.define("url", nargs="*", help="Article URLs to fetch", metavar="URL")

flags.parse()

news.init()
crawler = news.Crawler("fetch")

for url in flags.arg.url:
    crawler.crawl(url)

if flags.arg.urls:
    with open(flags.arg.urls) as f:
        for url in f.readlines():
            crawler.crawl(url.strip())

crawler.wait()
crawler.dumpstats()
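
Example #4 is a complete script: article URLs can be passed as positional arguments, read from a file given with --urls, or both, and each one is handed to the crawler. Assuming the script were saved as fetch.py (a hypothetical file name), a typical invocation might look like:

python3 fetch.py --urls articles.txt https://example.com/story.html
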
Example #5
    if text is None: return ""
    return text.strip().replace("\n", " ")


def get_atom_element(e, tag):
    child = e.find(tag)
    if child is None:
        child = e.find("{http://www.w3.org/2005/Atom}" + tag)
    if child is None: return ""
    text = child.text
    if text is None: return ""
    return text.strip().replace("\n", " ")


# Initialize news crawler.
crawler = news.Crawler("rss")

# Read RSS news feeds.
feeds = {}
f = open(flags.arg.feeds, "r")
rsssession = requests.Session()
for line in f.readlines():
    line = line.strip()
    if len(line) == 0 or line[0] == "#": continue
    fields = line.split(" ")
    site = fields[0]
    rss = fields[1]
    print("=== RSS feed", rss)

    # Fetch RSS feed.
    try:
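
The excerpt stops inside the try: that handles each feed. Below is a minimal sketch of how the loop body might continue, assuming the feed is downloaded with the shared rsssession, parsed with xml.etree.ElementTree (imported as ET, an assumption), and each item's link extracted with the get_atom_element helper defined above; the timeout, the error handling, and the final wait/dumpstats calls are patterned on the other examples rather than taken from this one.

        # Assumed continuation: download and parse the feed.
        r = rsssession.get(rss, timeout=60)
        root = ET.fromstring(r.content)
    except Exception as e:
        # Skip feeds that cannot be fetched or parsed.
        print("Error fetching RSS feed", rss, ":", e)
        continue

    # Crawl every article linked from the feed.
    for item in root.iter("item"):
        url = get_atom_element(item, "link")
        if len(url) == 0: continue
        crawler.crawl(url)

crawler.wait()
crawler.dumpstats()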