def checkLogin():
    # Attach any stored session cookie/modhash and verify the login is still valid.
    global isLoggedIn

    cur.execute("select cookie, modhash from session")
    if cur.rowcount == 0:
        return
    for s in cur.fetchall():
        opener.addheaders.append(('Cookie', 'reddit_session=%s' % s[0]))
        opener.addheaders.append(('X-Modhash', s[1]))

    try:
        success = False
        for i in range(_['http_retries']):
            f = opener.open('http://www.reddit.com/api/me.json')
            if f.getcode() == 200:
                success = True
                break
            else:
                log.write('Error %d for login status check attempt' % f.getcode(), 'error')
                if f.getcode() in [401, 403, 404]:
                    return
            time.sleep(_['sleep'])
        if not success:
            log.write('Retries exhausted for login status check', 'error')
            return
        time.sleep(_['sleep'])
    except Exception, e:
        log.write('Error checking login status: %s' % e, 'error')
        return
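# For reference, a minimal sketch of the `session` table this function reads,
# inferred only from the columns used here and in the modhash update further
# down; the real schema (types, keys) is an assumption:
#
#   create table session (
#       cookie  varchar(255),
#       modhash varchar(64)
#   );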
def getCommentTree(nodes, url, linkid, commentid, args, depth):
    # Walk a reddit comment listing, storing each comment and following
    # replies, "Listing" containers and "more" stubs as configured.
    global ccount
    for node in nodes:
        try:
            if node is None:
                break
            elif node['kind'] == 't1':
                try:
                    cur.execute("""replace into t1 (
                        id, link_id, parent_id, body, author, created, last_seen
                    ) values (%s, %s, %s, %s, %s, %s, now())""", (
                        lib.base36decode(node['data']['id']),
                        node['data']['link_id'],
                        node['data']['parent_id'],
                        node['data']['body'],
                        node['data']['author'],
                        datetime.datetime.fromtimestamp(node['data']['created_utc'])
                    ))
                    db.commit()
                    ccount += 1
                    if node['data']['replies'] != "":
                        getCommentTree([node['data']['replies']], url, linkid, commentid, args, depth)
                except Exception, e:
                    log.write('Error storing t1_' + node['data']['id'] + ': %s' % e, 'error')
                    db.rollback()
            elif node['kind'] == "Listing":
                getCommentTree(node['data']['children'], url, linkid, commentid, args, depth)
            elif node['kind'] == "more":
                if _['autoget_lte_20'] and node['data']['count'] <= 20 and node['data']['count'] >= _['autoget_threshold']:
                    children = ",".join(node['data']['children'])
                    time.sleep(_['sleep'])
                    get('http://www.reddit.com/api/morechildren/', linkid, "",
                        "api_type=json&depth=8&link_id=%s&children=%s" % (linkid, children), 0, True)
                elif node['data']['count'] >= _['comment_traverse_threshold']:
                    if node['data']['parent_id'] == linkid or node['data']['parent_id'] == commentid:
                        # sibling traversal
                        breadth = 0
                        for child in node['data']['children']:
                            if breadth >= _['comment_siblings_total']:
                                break
                            time.sleep(_['sleep'])
                            get(url, linkid, child, args, depth)
                            breadth += 1
                    else:
                        # child traversal
                        time.sleep(_['sleep'])
                        get(url, linkid, node['data']['parent_id'][3:], args, depth + 1)
        except Exception, e:
            log.write('Error processing comment node: %s' % e, 'error')
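# Illustrative only: the rough shape of the `nodes` argument, reconstructed from
# the keys read above (reddit's 't1', 'Listing' and 'more' kinds). The values are
# made-up placeholders, not real API output, and lib.base36decode is assumed to
# turn a base-36 id like 'c0xyz12' into an integer (roughly int(s, 36)).
#
# example_nodes = [{
#     'kind': 'Listing',
#     'data': {'children': [
#         {'kind': 't1', 'data': {'id': 'c0xyz12', 'link_id': 't3_abc123',
#                                 'parent_id': 't3_abc123', 'body': 'comment text',
#                                 'author': 'someuser', 'created_utc': 1300000000,
#                                 'replies': ''}},
#         {'kind': 'more', 'data': {'count': 5, 'parent_id': 't3_abc123',
#                                   'children': ['c0aaa01', 'c0aaa02']}},
#     ]}
# }]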
def printStats():
    cur.execute("select count(*) from crawl_locations")
    locationTotal = cur.fetchone()[0]

    linkLocations = len(linkTimes['counts'])
    linkCount = sum(linkTimes['counts'])
    linkElapsedTime = sum(linkTimes['times'])
    if linkCount == 0:
        linkRate = 0
    else:
        linkRate = linkElapsedTime / linkCount
    cur.execute("select count(*) from t3")
    linkTotal = cur.fetchone()[0]

    commentLocations = len(commentTimes['counts'])
    commentCount = sum(commentTimes['counts'])
    commentElapsedTime = sum(commentTimes['times'])
    if commentCount == 0:
        commentRate = 0
    else:
        commentRate = commentElapsedTime / commentCount
    cur.execute("select count(*) from t1")
    commentTotal = cur.fetchone()[0]

    responseLocations = len(responseTimes['counts'])
    responseCount = sum(responseTimes['counts'])
    responseElapsedTime = sum(responseTimes['times'])
    if responseCount == 0:
        responseRate = 0
    else:
        responseRate = responseElapsedTime / responseCount
    cur.execute("select count(*) from responses")
    responseTotal = cur.fetchone()[0]

    totalElapsed = time.time() - startTime

    log.write("%d link(s) / %f sec. (%f sec. ea.) in %d location(s)" % (linkCount, linkElapsedTime, linkRate, linkLocations), 'stat')
    log.write("%d comment(s) / %f sec. (%f sec. ea.) in %d thread(s)" % (commentCount, commentElapsedTime, commentRate, commentLocations), 'stat')
    log.write("%d response(s) / %f sec. (%f sec. ea.) in %d thread(s)" % (responseCount, responseElapsedTime, responseRate, responseLocations), 'stat')
    log.write("%d location(s) / %d link(s) / %d comment(s) / %d response(s) currently in database" % (locationTotal, linkTotal, commentTotal, responseTotal), 'stat')
    log.write("Execution took %f sec. (%f minutes)" % (totalElapsed, totalElapsed / 60), 'stat')
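# The timing dicts summed above presumably accumulate one entry per location or
# thread crawled, e.g. (illustrative values; how the stats module fills them is
# not shown in this excerpt):
#
#   linkTimes = {'counts': [25, 25], 'times': [12.4, 11.9]}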
def build(crawl_subreddits, crawl_urls):
    urls = []
    log.write("Building location list...", 'message')
    for subreddit in crawl_subreddits:
        for sort in subreddit['sort']:
            if sort == "all":
                sort = ""
            urls.append("http://www.reddit.com/r/" + subreddit['subreddit'] + "/" + sort + ".json")
    for url in crawl_urls:
        urls.append(url + ".json")
    for url in urls:
        try:
            cur.execute("select id from crawl_locations where url = %s", (url,))
            if cur.rowcount > 0:
                cur.execute("update crawl_locations set last_seen = now() where url = %s", (url,))
            else:
                cur.execute("""insert into crawl_locations (
                    url, last_seen, last_crawled
                ) values (%s, now(), 0)""", (url,))
            db.commit()
        except Exception, e:
            log.write('Error storing location: ' + url + ': %s' % e, 'error')
            db.rollback()
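# A hypothetical example of the two arguments, matching the keys build() reads
# ('subreddit' plus a list of 'sort' values, where "all" maps to the subreddit's
# front page); the real values come from conf._ and may differ:
#
#   crawl_subreddits = [
#       {'subreddit': 'python', 'sort': ['all', 'new']},
#   ]
#   crawl_urls = ['http://www.reddit.com/r/programming/comments/abc123/example_thread']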
from sys import argv

from conf import _
from init import db, cur
import comments
import lib
import links
import locations
import log
import respond
import stats
import user

# Delete old links and comments
if 'runall' in argv or 'cleanup' in argv:
    if _['delete_links_after'] > -1:
        cur.execute("delete from t3 where created < date_sub(now(), interval %s second)", (_['delete_links_after'],))
    if _['delete_comments_after'] > -1:
        cur.execute("delete from t1 where created < date_sub(now(), interval %s second)", (_['delete_comments_after'],))
    db.commit()

# Build/store locations to retrieve links
if 'runall' in argv or 'locations' in argv:
    locations.build(_['crawl_subreddits'], _['crawl_urls'])

# Crawl URLs from locations
if 'runall' in argv or 'links' in argv:
    cur.execute("select id, url from crawl_locations where last_crawled < date_sub(now(), interval %s second)", (_['find_links_after'],))
    for l in cur.fetchall():
        links.get("%s?limit=%d" % (l[1], _['links_per_page']))
        cur.execute("update crawl_locations set last_crawled = now() where id = %s", (l[0],))
        db.commit()
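# The script is driven by bare keywords in argv, as checked above; for example
# (the script name here is hypothetical):
#
#   python crawl.py runall
#   python crawl.py cleanup locations links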
rJSON = f.read()
f.close()

try:
    links = json.loads(rJSON)
except Exception, e:
    log.write('Error parsing links url: %s - %s' % (finalUrl, e), 'error')
    return

after = links['data']['after']
for l in links['data']['children']:
    try:
        if l['kind'] == 't3':
            try:
                cur.execute("select id from t3 where id = %s", (lib.base36decode(l['data']['id']),))
                if cur.rowcount > 0:
                    cur.execute("update t3 set last_seen = now() where id = %s", (lib.base36decode(l['data']['id']),))
                else:
                    if l['data']['is_self']:
                        content = l['data']['selftext']
                    else:
                        content = None
                    cur.execute("""insert into t3 (
                        id, title, url, permalink, content, author, created, last_seen,
except Exception, e:
    log.write('Error checking login status: %s' % e, 'error')
    return

rJSON = f.read()
f.close()

try:
    res = json.loads(rJSON)
except Exception, e:
    log.write('Error parsing login status response: %s' % e, 'error')
    return

if 'data' in res:
    opener.addheaders.append(('X-Modhash', res['data']['modhash']))
    try:
        cur.execute("update session set modhash = %s", (res['data']['modhash'],))
        db.commit()
    except Exception, e:
        log.write('Error updating modhash: %s' % e, 'error')
        return
    isLoggedIn = True

def login():
    global isLoggedIn
    log.write('Logging in user %s' % _['reddit_username'], 'message')
    try:
        success = False
        for i in range(_['http_retries']):
import re

from conf import _
from init import db, cur, opener
import lib
import log
import stats
import user
import userfunctions

quotedRE = re.compile("^>.*$", re.I | re.M)

# Pre-compile each rule's regex once
for i in range(len(_["rules"])):
    if "regex" in _["rules"][i]:
        _["rules"][i]["re"] = re.compile(_["rules"][i]["regex"], re.I | re.M)

cur.execute("select distinct thing_id from responses")
responses = cur.fetchall()
rcount = 0

def processComment(cid, body, author):
    for rule in _["rules"]:
        if "flags" in rule and "ignoreQuotedText" in rule["flags"]:
            body = re.sub(quotedRE, "", body)
        if "flags" not in rule or ("flags" in rule and "selftextOnly" not in rule["flags"]):
            if "user_function" in rule:
                try:
                    getattr(userfunctions, rule["user_function"])(cid, body, author)
                except Exception, e: