def build(crawl_subreddits, crawl_urls):
    urls = []
    log.write("Building location list...", 'message')
    # Expand each configured subreddit into one listing URL per sort.
    for subreddit in crawl_subreddits:
        for sort in subreddit['sort']:
            if sort == "all":
                sort = ""
            urls.append("http://www.reddit.com/r/" + subreddit['subreddit'] + "/" + sort + ".json")
    for url in crawl_urls:
        urls.append(url + ".json")
    # Upsert each location: refresh last_seen if it is already known,
    # otherwise insert it with last_crawled = 0 so the next crawl pass
    # picks it up.
    for url in urls:
        try:
            cur.execute("select id from crawl_locations where url = %s", (url,))
            if cur.rowcount > 0:
                cur.execute("update crawl_locations set last_seen = now() where url = %s", (url,))
            else:
                cur.execute("""insert into crawl_locations (
                        url, last_seen, last_crawled
                    ) values (%s, now(), 0)""", (url,))
            db.commit()
        except Exception, e:
            log.write('Error storing location: ' + url + ': %s' % e, 'error')
            db.rollback()
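# A hedged sketch of the input shapes build() assumes, with hypothetical
# sample values (the crawler's real configuration is defined elsewhere):
# each subreddit entry names the subreddit and the listing sorts to crawl,
# and "all" maps to the bare /r/<name>/.json listing.
#
#   example_crawl_subreddits = [
#       {'subreddit': 'programming', 'sort': ['all', 'new', 'top']},
#   ]
#   example_crawl_urls = ['http://www.reddit.com/r/programming/comments/abc123/example']
#
# build(example_crawl_subreddits, example_crawl_urls) would register:
#   http://www.reddit.com/r/programming/.json
#   http://www.reddit.com/r/programming/new.json
#   http://www.reddit.com/r/programming/top.json
#   http://www.reddit.com/r/programming/comments/abc123/example.json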
def getCommentTree(nodes, url, linkid, commentid, args, depth):
    global ccount
    for node in nodes:
        try:
            if node is None:
                break
            elif node['kind'] == 't1':
                # t1 = comment: store it, then recurse into its replies.
                try:
                    cur.execute("""replace into t1 (
                            id, link_id, parent_id, body, author, created, last_seen
                        ) values (%s, %s, %s, %s, %s, %s, now())""", (
                        lib.base36decode(node['data']['id']),
                        node['data']['link_id'],
                        node['data']['parent_id'],
                        node['data']['body'],
                        node['data']['author'],
                        datetime.datetime.fromtimestamp(node['data']['created_utc'])
                    ))
                    db.commit()
                    ccount += 1
                    if node['data']['replies'] != "":
                        getCommentTree([node['data']['replies']], url, linkid, commentid, args, depth)
                except Exception, e:
                    log.write('Error storing t1_' + node['data']['id'] + ': %s' % e, 'error')
                    db.rollback()
            elif node['kind'] == "Listing":
                getCommentTree(node['data']['children'], url, linkid, commentid, args, depth)
            elif node['kind'] == "more":
                # "more" stub: either fetch the collapsed children directly
                # via the morechildren API, or re-crawl around the stub.
                if _['autoget_lte_20'] and node['data']['count'] <= 20 and node['data']['count'] >= _['autoget_threshold']:
                    children = ",".join(node['data']['children'])
                    time.sleep(_['sleep'])
                    get('http://www.reddit.com/api/morechildren/', linkid, "",
                        "api_type=json&depth=8&link_id=%s&children=%s" % (linkid, children), 0, True)
                elif node['data']['count'] >= _['comment_traverse_threshold']:
                    if node['data']['parent_id'] == linkid or node['data']['parent_id'] == commentid:
                        # Sibling traversal: fetch each hidden sibling up to
                        # the configured breadth limit.
                        breadth = 0
                        for child in node['data']['children']:
                            if breadth >= _['comment_siblings_total']:
                                break
                            time.sleep(_['sleep'])
                            get(url, linkid, child, args, depth)
                            breadth += 1
                    else:
                        # Child traversal: re-crawl from the stub's parent
                        # comment, one level deeper. parent_id[3:] strips the
                        # "t1_" type prefix.
                        time.sleep(_['sleep'])
                        get(url, linkid, node['data']['parent_id'][3:], args, depth + 1)
        except Exception, e:
            log.write('Error checking comment tree node type: %s' % e, 'error')
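# getCommentTree() keys each stored comment on lib.base36decode(): reddit
# thing IDs are base36 strings, so decoding them yields compact integer
# primary keys. A minimal sketch of that helper, assuming it matches
# Python's own base-36 parsing (the real lib module is not shown here):
#
#   def base36decode(s):
#       return int(s, 36)   # e.g. int('c3', 36) == 435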
                            created, last_seen, last_crawled
                        ) values (%s, %s, %s, %s, %s, %s, %s, now(), 0)""", (
                        lib.base36decode(l['data']['id']),
                        l['data']['title'],
                        l['data']['url'],
                        l['data']['permalink'],
                        content,
                        l['data']['author'],
                        datetime.datetime.fromtimestamp(l['data']['created_utc'])
                    ))
                    db.commit()
                except Exception, e:
                    log.write('Error storing t3_' + l['data']['id'] + ': %s' % e, 'error')
                    db.rollback()
            except Exception, e:
                log.write('Error checking links file node type: %s' % e, 'error')
        #endfor l in links
        stats.linkTimes['counts'].append(len(links['data']['children']))
        stats.linkTimes['times'].append(time.time() - start)
        time.sleep(_['sleep'])
    #endfor p in pages
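# stats.linkTimes accumulates parallel per-page samples: how many links a
# page yielded and how long the fetch took. A hedged sketch of a throughput
# figure derivable from those lists (the real stats reporting is not shown
# in this excerpt):
#
#   def links_per_second(linkTimes):
#       total_time = sum(linkTimes['times'])
#       return sum(linkTimes['counts']) / total_time if total_time else 0.0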