def crawl_room(room, http):
    """
    See if there's a subreddit corresponding to this room.

    If there is, fill in the model's moderator list, subscriber count,
    and shortname_display, then save the room.

    Parameters:
        room: Room model instance; mutated and saved in place.
        http: urllib3-style pool with a .request(method, url) method.
    """
    # Save immediately so even if it errors we don't try again too fast.
    room.last_crawled = datetime.datetime.now()
    room.save()

    subreddit = room.shortname
    log(logger, 'debug', 'Crawling', subreddit)

    url = "http://www.reddit.com/r/%s/about/moderators/" % subreddit
    r = http.request('GET', url)
    if r.status != 200:
        # We know that "tester" gives a 403 for whatever reason
        if subreddit not in ['tester']:
            log(logger, 'error', 'Request got error:', r.status, "on url", url)
        return
    soup = BeautifulSoup(r.data)

    # Check whether subreddit exists:
    if soup.find(id='noresults'):
        # BUG FIX: this previously assigned to the HTTP response `r`
        # instead of the Room model, so nothing was actually cleared.
        room.moderators = room.subscribers = None
        room.save()
        return

    # Get display shortname
    try:
        title = soup.find(id='moderator-table').h1.text
    except AttributeError:
        # We couldn't find the moderator table
        log(logger, 'info', 'Could not find moderator table for:', room.shortname)
        return
    assert title.startswith('moderators of ')
    shortname_display = title.replace('moderators of ', '')
    room.shortname_display = shortname_display

    # Get number of subscribers (strip any non-digit formatting, e.g. commas)
    number = soup.find('div', 'side').find('span', 'number').text
    number = int(re.sub('[^0-9]', '', number))
    room.subscribers = number

    # Get moderator list
    mods = soup.find(id='moderator-table').find_all('span', 'user')
    mods = [m.a.text for m in mods]
    room.moderators = mods

    # Get image URL; fall back to the existing value, then empty string
    room.image_url = soup.find(id='header-img').get('src') or room.image_url or ''

    # We need a second request (the subreddit front page) to get the title
    url = "http://www.reddit.com/r/%s/" % subreddit
    r = http.request('GET', url)
    if r.status != 200:
        log(logger, 'error', 'Request got error:', r.status, "on url", url)
        return
    soup = BeautifulSoup(r.data)
    room.title = soup.title.text

    # Write
    log(logger, 'debug', 'Setting', subreddit, 'to', room.to_dict())
    room.save()
def main():
    """Sync MUC room affiliations: make ME owner everywhere, mods admins."""
    online = get_online_rooms()
    log(logger, "debug", "Online jids:", online)
    online = filter(is_subreddit_room, online)
    log(logger, "debug", "Valid online jids:", online)
    for jid in online:
        node, host = jid.split("@")
        assert host == XMPP_MUC_DOMAIN
        affs = get_mnesia_affiliations(node)
        # Ensure we hold the owner affiliation on every room.
        if affs.get(ME) != OWNER:
            set_room_affiliation(ME, node, OWNER)
        # Promote the subreddit's moderators to room admins.
        try:
            room = get_room_by_node(node)
        except Room.DoesNotExist:
            # Some rooms may not be in the DB. User created (hax),
            # or our DB got emptied.
            continue
        mods = room.get_reddit_moderators()
        log(logger, "debug", "Current affiliations for room:", room.title, affs)
        log(logger, "debug", "Found mods for room:", room.title, mods)
        # TODO: don't set again if already set
        for mod in mods:
            set_room_affiliation(mod, node, ADMIN)
def crawl_room(room, http):
    """
    See if there's a subreddit corresponding to this room.

    If there is, fill in the model's moderator list, subscriber count,
    and shortname_display, then save the room.

    Parameters:
        room: Room model instance; mutated and saved in place.
        http: urllib3-style pool with a .request(method, url) method.
    """
    # Save immediately so even if it errors we don't try again too fast.
    room.last_crawled = datetime.datetime.now()
    room.save()

    subreddit = room.shortname
    log(logger, "debug", "Crawling", subreddit)

    url = "http://www.reddit.com/r/%s/about/moderators/" % subreddit
    r = http.request("GET", url)
    if r.status != 200:
        # We know that "tester" gives a 403 for whatever reason
        if subreddit not in ["tester"]:
            log(logger, "error", "Request got error:", r.status, "on url", url)
        return
    soup = BeautifulSoup(r.data)

    # Check whether subreddit exists:
    if soup.find(id="noresults"):
        # BUG FIX: this previously assigned to the HTTP response `r`
        # instead of the Room model, so nothing was actually cleared.
        room.moderators = room.subscribers = None
        room.save()
        return

    # Get display shortname; guard against a missing moderator table so
    # this matches the behavior of the other crawl_room variant instead
    # of raising AttributeError.
    try:
        title = soup.find(id="moderator-table").h1.text
    except AttributeError:
        log(logger, "info", "Could not find moderator table for:", room.shortname)
        return
    assert title.startswith("moderators of ")
    shortname_display = title.replace("moderators of ", "")
    room.shortname_display = shortname_display

    # Get number of subscribers (strip any non-digit formatting, e.g. commas)
    number = soup.find("div", "side").find("span", "number").text
    number = int(re.sub("[^0-9]", "", number))
    room.subscribers = number

    # Get moderator list
    mods = soup.find(id="moderator-table").find_all("span", "user")
    mods = [m.a.text for m in mods]
    room.moderators = mods

    # Get image URL; fall back to the existing value, then empty string
    room.image_url = soup.find(id="header-img").get("src") or room.image_url or ""

    # We need a second request (the subreddit front page) to get the title
    url = "http://www.reddit.com/r/%s/" % subreddit
    r = http.request("GET", url)
    if r.status != 200:
        log(logger, "error", "Request got error:", r.status, "on url", url)
        return
    soup = BeautifulSoup(r.data)
    room.title = soup.title.text

    # Write
    log(logger, "debug", "Setting", subreddit, "to", room.to_dict())
    room.save()
def main():
    """Fetch the rooms due for crawling and crawl them."""
    targets = get_rooms_to_crawl()
    log(logger, "debug", "Got rooms to crawl:", targets)
    crawl_rooms(targets)
def add_rooms(room_names):
    """Ensure a Room record exists for each of the given shortnames."""
    for shortname in room_names:
        log(logger, 'debug', 'get or create', shortname)
        Room.get_or_create_by_shortname(shortname)
def set_room_affiliation(username, room_node, affiliation):
    """Grant `username` the given MUC affiliation on `room_node` via ejabberdctl."""
    log(logger, "debug", "Setting room affiliation:", username, room_node, affiliation)
    jid = "%s@%s" % (username, XMPP_DOMAIN)
    ejabberdctl_command([
        "set_room_affiliation",
        room_node,
        XMPP_MUC_DOMAIN,
        jid.lower(),
        affiliation,
    ])
# Module-level script body: walk every online MUC room, ensure ME is owner,
# and promote each subreddit's moderators to room admins.
# NOTE(review): this duplicates main()'s body at module level, so it runs on
# import as well as under the __main__ guard — confirm whether it should be
# moved inside main() before restructuring.
room_jids = get_online_rooms()
log(logger, "debug", "Online jids:", room_jids)
# Keep only rooms whose node names look like subreddits.
room_jids = filter(is_subreddit_room, room_jids)
log(logger, "debug", "Valid online jids:", room_jids)
for rjid in room_jids:
    room_node, room_host = rjid.split("@")
    assert room_host == XMPP_MUC_DOMAIN
    affiliations = get_mnesia_affiliations(room_node)
    # Make me owner
    if affiliations.get(ME) != OWNER:
        set_room_affiliation(ME, room_node, OWNER)
    # Make moderators admins
    try:
        room = get_room_by_node(room_node)
    except Room.DoesNotExist:
        # Some rooms may not be in the DB. User created (hax),
        # or our DB got emptied.
        continue
    reddit_mods = room.get_reddit_moderators()
    log(logger, "debug", "Current affiliations for room:", room.title, affiliations)
    log(logger, "debug", "Found mods for room:", room.title, reddit_mods)
    # TODO: don't set again if already set
    for mod_username in reddit_mods:
        set_room_affiliation(mod_username, room_node, ADMIN)

if __name__ == "__main__":
    log(logger, "debug", "set_room_affiliations starting")
    main()
    log(logger, "debug", "set_room_affiliations done")
def main():
    """Look up which rooms need crawling and kick off the crawl."""
    rooms = get_rooms_to_crawl()
    log(logger, 'debug', 'Got rooms to crawl:', rooms)
    crawl_rooms(rooms)