Beispiel #1
0
"""
Crawl Reddit for moderator and subscriber # data and insert it into the database.
"""

from bs4 import BeautifulSoup
import datetime
import re
import time
import urllib3

from redditchat.core.models import Room

from script_log import make_logger, log

logger = make_logger("crawl_reddit")


def get_rooms_to_crawl():
    """Return the queryset of rooms due for a crawl.

    A room qualifies if it has never been crawled, or if its last crawl
    is more than one day old. The Front Page pseudo-room is always
    excluded because it has no corresponding subreddit.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(days=1)
    uncrawled = Room.objects.filter(last_crawled__isnull=True)
    stale = Room.objects.exclude(last_crawled__isnull=True).filter(last_crawled__lt=cutoff)
    # Union the two sets, then drop the Front Page.
    return (uncrawled | stale).exclude(shortname="frontpage")


def crawl_room(room, http):
    """
    See if there's a subreddit corresponding to this room. If there is,
    fill in the model's moderator list, subscriber count, and shortname_display.
    """
"""
Match XMPP affiliations in Mnesia to Reddit's moderators.

MUC rooms correspond to subreddits. A user is added as an admin in the room
if they are a moderator of the subreddit.
"""

import re
import subprocess

from redditchat.core.models import Room
from redditchat.stagesettings import XMPP_DOMAIN, XMPP_MUC_DOMAIN

from script_log import make_logger, log

logger = make_logger("set_room_affiliations")

# XMPP username of the script operator — presumably kept as the room
# owner so moderator churn never locks the operator out. TODO confirm.
ME = "badr"
# MUC affiliation levels passed to ejabberdctl (XEP-0045 affiliations).
OWNER = "owner"
ADMIN = "admin"


def ejabberdctl_command(args):
    """Run ``ejabberdctl <args>`` and return its raw stdout (bytes).

    Raises subprocess.CalledProcessError if the command exits non-zero.

    The previous implementation did ``assert not p.wait()`` before reading
    stdout: the assert is silently stripped under ``python -O``, and
    ``wait()``-before-read can deadlock once the child fills the OS pipe
    buffer. ``check_output`` drains the pipe and checks the exit status.
    """
    return subprocess.check_output(["ejabberdctl"] + args)


def get_online_rooms():
    """List of all online room JIDs."""
Beispiel #3
0
from bs4 import BeautifulSoup
import urllib3

from redditchat.core.models import Room

from script_log import make_logger, log

logger = make_logger('seed_rooms')

#URL = 'http://subredditfinder.com/redditor.php?sort=top&max=1250'
#
#
#def fetch_top_rooms_html():
#    # XXX doesn't work, we get 400, probably a Referer check
#    http = urllib3.PoolManager(headers={'User-Agent': 'Seddit.com - contact [email protected]'})
#    r = http.request('GET', URL)
#    if r.status != 200:
#        log(logger, 'error', 'Request got error:', r.status, "on url", URL)
#        raise Exception(r.status, URL)
#    return r.data
#
#def extract_room_names(html):
#    result = [] # Use a list to preserve priority
#    soup = BeautifulSoup(html)
#    top = soup.find('a').nextSibling.nextSibling.nextSibling.nextSibling.text
#    result.append(top)
#    for br in soup.find_all('br'):
#        try:
#            result.add(br.nextSibling.nextSibling.nextSibling.nextSibling.nextSibling.text)
#        except AttributeError:
#            break
Beispiel #4
0
"""
Crawl Reddit for moderator and subscriber # data and insert it into the database.
"""

from bs4 import BeautifulSoup
import datetime
import re
import time
import urllib3

from redditchat.core.models import Room

from script_log import make_logger, log

logger = make_logger('crawl_reddit')

def get_rooms_to_crawl():
    """Select the rooms that need (re-)crawling.

    Eligible rooms are those never crawled at all plus those whose most
    recent crawl happened over 24 hours ago; the synthetic Front Page
    room is filtered out since it is not a real subreddit.
    """
    one_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
    fresh_candidates = Room.objects.filter(last_crawled__isnull=True)
    overdue = (
        Room.objects
        .exclude(last_crawled__isnull=True)
        .filter(last_crawled__lt=one_day_ago)
    )
    eligible = fresh_candidates | overdue
    return eligible.exclude(shortname='frontpage')

def crawl_room(room, http):
    """
    See if there's a subreddit corresponding to this room. If there is,
    fill in the model's moderator list, subscriber count, and shortname_display.
    """
    room.last_crawled = datetime.datetime.now()
    room.save() # Save immediately so even if it errors we don't try again too fast