""" Crawl Reddit for moderator and subscriber # data and insert it into the database. """ from bs4 import BeautifulSoup import datetime import re import time import urllib3 from redditchat.core.models import Room from script_log import make_logger, log logger = make_logger("crawl_reddit") def get_rooms_to_crawl(): now = datetime.datetime.now() one_day_ago = now - datetime.timedelta(days=1) never_crawled = Room.objects.filter(last_crawled__isnull=True) not_crawled_today = Room.objects.exclude(last_crawled__isnull=True).filter(last_crawled__lt=one_day_ago) # Exclude Front Page: return (never_crawled | not_crawled_today).exclude(shortname="frontpage") def crawl_room(room, http): """ See if there's a subreddit corresponding to this room. If there is, fill in the model's moderator list, subscriber count, and shortname_display. """
""" Match XMPP affiliations in Mnesia to Reddit's moderators. MUC rooms correspond to subreddits. A user is added as an admin in the room if they are a moderator of the subreddit. """ import re import subprocess from redditchat.core.models import Room from redditchat.stagesettings import XMPP_DOMAIN, XMPP_MUC_DOMAIN from script_log import make_logger, log logger = make_logger("set_room_affiliations") ME = "badr" OWNER = "owner" ADMIN = "admin" def ejabberdctl_command(args): args = ["ejabberdctl"] + args p = subprocess.Popen(args, stdout=subprocess.PIPE) assert not p.wait() return p.stdout.read() def get_online_rooms(): """List of all online room JIDs."""
from bs4 import BeautifulSoup
import urllib3

from redditchat.core.models import Room
from script_log import make_logger, log

# Logger for this script, created under the name 'seed_rooms'.
logger = make_logger('seed_rooms')

# NOTE: The scraper below is disabled. Per the XXX note inside
# fetch_top_rooms_html, the request came back with HTTP 400 (a Referer
# check is suspected), so the code is kept commented out for reference.
# NOTE(review): in the disabled extract_room_names, `result` is a list but
# the loop calls `result.add(...)`; that would have to become `append`
# if this code is ever re-enabled.
#URL = 'http://subredditfinder.com/redditor.php?sort=top&max=1250'
#
#
#def fetch_top_rooms_html():
#    # XXX doesn't work, we get 400, probably a Referer check
#    http = urllib3.PoolManager(headers={'User-Agent': 'Seddit.com - contact [email protected]'})
#    r = http.request('GET', URL)
#    if r.status != 200:
#        log(logger, 'error', 'Request got error:', r.status, "on url", URL)
#        raise Exception(r.status, URL)
#    return r.data
#
#def extract_room_names(html):
#    result = [] # Use a list to preserve priority
#    soup = BeautifulSoup(html)
#    top = soup.find('a').nextSibling.nextSibling.nextSibling.nextSibling.text
#    result.append(top)
#    for br in soup.find_all('br'):
#        try:
#            result.add(br.nextSibling.nextSibling.nextSibling.nextSibling.nextSibling.text)
#        except AttributeError:
#            break
""" Crawl Reddit for moderator and subscriber # data and insert it into the database. """ from bs4 import BeautifulSoup import datetime import re import time import urllib3 from redditchat.core.models import Room from script_log import make_logger, log logger = make_logger('crawl_reddit') def get_rooms_to_crawl(): now = datetime.datetime.now() one_day_ago = now - datetime.timedelta(days=1) never_crawled = Room.objects.filter(last_crawled__isnull=True) not_crawled_today = Room.objects.exclude(last_crawled__isnull=True).filter(last_crawled__lt=one_day_ago) # Exclude Front Page: return (never_crawled | not_crawled_today).exclude(shortname='frontpage') def crawl_room(room, http): """ See if there's a subreddit corresponding to this room. If there is, fill in the model's moderator list, subscriber count, and shortname_display. """ room.last_crawled = datetime.datetime.now() room.save() # Save immediately so even if it errors we don't try again too fast