Example #1
def build(crawl_subreddits, crawl_urls):
    urls = []

    log.write("Building location list...", 'message')

    for subreddit in crawl_subreddits:
        for sort in subreddit['sort']:
            # "all" means the default listing: drop the sort segment from the URL
            if sort == "all": sort = ""
            urls.append("http://www.reddit.com/r/" + subreddit['subreddit'] + "/" + sort + ".json")

    for url in crawl_urls:
        urls.append(url + ".json")

    for url in urls:
        try:
            cur.execute("select id from crawl_locations where url = %s", (url,))
            if cur.rowcount > 0:
                cur.execute("update crawl_locations set last_seen = now() where url = %s", (url,))
            else:
                cur.execute("""insert into crawl_locations (
                                url,
                                last_seen,
                                last_crawled
                            ) values (%s, now(), 0)""", (
                                url,
                            ))
            db.commit()

        except Exception as e:
            log.write('Error storing location: ' + url + ': %s' % e, 'error')
            db.rollback()
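
For reference, a hypothetical invocation of build() is sketched below. The key names ('subreddit', 'sort') match what the function reads above; the concrete values and the example URL are illustrative, not taken from the source.

# Hypothetical configuration -- key names follow what build() reads above.
crawl_subreddits = [
    {'subreddit': 'python', 'sort': ['all', 'new', 'top']},
]
crawl_urls = [
    'http://www.reddit.com/r/programming/comments/abc123',  # made-up URL
]

build(crawl_subreddits, crawl_urls)
# Generated locations ("all" drops the sort segment):
#   http://www.reddit.com/r/python/.json
#   http://www.reddit.com/r/python/new.json
#   http://www.reddit.com/r/python/top.json
#   http://www.reddit.com/r/programming/comments/abc123.json

Since the storage layer is MySQL (the other examples use replace into), the select-then-update-or-insert sequence could also be collapsed into a single insert ... on duplicate key update statement, provided crawl_locations has a unique index on url.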
            
Example #2
def getCommentTree(nodes, url, linkid, commentid, args, depth):
    # Walk reddit's nested comment JSON: "t1" nodes are comments, "Listing"
    # nodes wrap a list of children, and "more" nodes are stubs standing in
    # for replies that were not included in the response.
    global ccount

    for node in nodes:
        try:
            if node is None:
                break

            elif node['kind'] == 't1':
                try:
                    cur.execute("""replace into t1 (
                                    id,
                                    link_id,
                                    parent_id,
                                    body, 
                                    author,
                                    created,
                                    last_seen
                                ) values (%s, %s, %s, %s, %s, %s, now())""", (
                                    lib.base36decode(node['data']['id']), 
                                    node['data']['link_id'],
                                    node['data']['parent_id'], 
                                    node['data']['body'], 
                                    node['data']['author'], 
                                    datetime.datetime.fromtimestamp(node['data']['created_utc'])
                                ))
                    db.commit()
                    ccount += 1

                    if node['data']['replies'] != "":
                        getCommentTree([node['data']['replies']], url, linkid, commentid, args, depth)

                except Exception as e:
                    log.write('Error storing t1_' + node['data']['id'] + ': %s' % e, 'error')
                    db.rollback()

            elif node['kind'] == "Listing":
                getCommentTree(node['data']['children'], url, linkid, commentid, args, depth)

            elif node['kind'] == "more":
                # small stub: fetch all hidden children in a single morechildren call
                if _['autoget_lte_20'] and node['data']['count'] <= 20 and node['data']['count'] >= _['autoget_threshold']:
                    children = ",".join(node['data']['children'])
                    time.sleep(_['sleep'])
                    get('http://www.reddit.com/api/morechildren/', linkid, "", "api_type=json&depth=8&link_id=%s&children=%s" % (linkid, children), 0, True)

                elif node['data']['count'] >= _['comment_traverse_threshold']:
                    if node['data']['parent_id'] == linkid or node['data']['parent_id'] == commentid:
                        #sibling traversal
                        breadth = 0
                        for child in node['data']['children']:
                            if breadth >= _['comment_siblings_total']:
                                break
                            time.sleep(_['sleep'])
                            get(url, linkid, child, args, depth)
                            breadth += 1
                    else:
                        #child traversal: strip the kind prefix (e.g. "t1_") and recurse one level deeper
                        time.sleep(_['sleep'])
                        get(url, linkid, node['data']['parent_id'][3:], args, depth + 1)
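
getCommentTree() reads its tuning knobs from a module-level dict named _. Only the key names used above are known from this example; the values below are a hypothetical sketch, not the project's real settings.

# Hypothetical settings -- key names are the ones getCommentTree() reads;
# the values are illustrative guesses.
_ = {
    'sleep': 2,                          # seconds to wait between requests
    'autoget_lte_20': True,              # expand "more" stubs of <= 20 comments in one call
    'autoget_threshold': 5,              # ...but only when at least this many are hidden
    'comment_traverse_threshold': 10,    # recurse into larger "more" stubs
    'comment_siblings_total': 8,         # cap on sibling branches followed per stub
}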
Example #3
                    try:
                        cur.execute("""replace into t3 (
                                            id,
                                            title,
                                            url,
                                            permalink,
                                            content,
                                            author,
                                            created,
                                            last_seen,
                                            last_crawled
                                        ) values (%s, %s, %s, %s, %s, %s, %s, now(), 0)""", (
                                            lib.base36decode(l['data']['id']), 
                                            l['data']['title'], 
                                            l['data']['url'], 
                                            l['data']['permalink'],
                                            content,
                                            l['data']['author'],
                                            datetime.datetime.fromtimestamp(l['data']['created_utc'])
                                        ))
                        db.commit()

                    except Exception as e:
                        log.write('Error storing t3_' + l['data']['id'] + ': %s' % e, 'error')
                        db.rollback()

            except Exception as e:
                log.write('Error checking links file node type: %s' % e, 'error')

        #endfor l in links

        stats.linkTimes['counts'].append(len(links['data']['children']))
        stats.linkTimes['times'].append(time.time() - start)

        time.sleep(_['sleep'])

    #endfor p in pages
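
All three examples convert reddit's base-36 "thing" IDs to integers with lib.base36decode before using them as numeric primary keys. The helper itself is not shown on this page; a minimal sketch, assuming it simply interprets the ID string in base 36, is:

def base36decode(thing_id):
    # Reddit IDs such as 'abc123' are lowercase base-36 strings,
    # which Python's int() can parse directly.
    return int(thing_id, 36)

# e.g. base36decode('zz') == 35 * 36 + 35 == 1295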