Beispiel #1
0
def getCommentTree(nodes, url, linkid, commentid, args, depth):
    global ccount

    for node in nodes:
        try:
            if node is None:
                break

            elif node['kind'] == 't1':
                try:
                    cur.execute("""replace into t1 (
                                    id,
                                    link_id,
                                    parent_id,
                                    body, 
                                    author,
                                    created,
                                    last_seen
                                ) values (%s, %s, %s, %s, %s, %s, now())""", (
                                    lib.base36decode(node['data']['id']), 
                                    node['data']['link_id'],
                                    node['data']['parent_id'], 
                                    node['data']['body'], 
                                    node['data']['author'], 
                                    datetime.datetime.fromtimestamp(node['data']['created_utc'])
                                ))
                    db.commit()
                    ccount += 1

                    if node['data']['replies'] != "":
                        getCommentTree([node['data']['replies']], url, linkid, commentid, args, depth)

                except Exception, e:
                    log.write('Error storing t1_' + node['data']['id'] + ': %s' % e, 'error')
                    db.rollback()

            elif node['kind'] == "Listing":
                getCommentTree(node['data']['children'], url, linkid, commentid, args, depth)

            elif node['kind'] == "more":
                if _['autoget_lte_20'] and node['data']['count'] <= 20 and node['data']['count'] >= _['autoget_threshold']:
                    children = ",".join(node['data']['children'])
                    time.sleep(_['sleep'])
                    get('http://www.reddit.com/api/morechildren/', linkid, "", "api_type=json&depth=8&link_id=%s&children=%s" % (linkid, children), 0, True)

                elif node['data']['count'] >= _['comment_traverse_threshold']:
                    if node['data']['parent_id'] == linkid or node['data']['parent_id'] == commentid:
                        #sibling traversal
                        breadth = 0
                        for child in node['data']['children']:
                            if breadth >= _['comment_siblings_total']:
                                break
                            time.sleep(_['sleep'])
                            get(url, linkid, child, args, depth)
                            breadth += 1
                    else:
                        #child traversal
                        time.sleep(_['sleep'])
                        get(url, linkid, node['data']['parent_id'][3:], args, depth + 1)
Beispiel #2
0
        rJSON = f.read()
        f.close()

        try: links = json.loads(rJSON)
        except Exception, e:
            log.write('Error parsing links url: %s - %s' % (finalUrl, e), 'error')
            return

        after = links['data']['after']

        for l in links['data']['children']:
            try:
                if l['kind'] == 't3':
                    try:
                        cur.execute("select id from t3 where id = %s", (lib.base36decode(l['data']['id']),))
                        if cur.rowcount > 0:
                            cur.execute("update t3 set last_seen = now() where id = %s", (lib.base36decode(l['data']['id']),))
                        else:
                            if l['data']['is_self']: content = l['data']['selftext']
                            else: content = None;
                            
                            cur.execute("""insert into t3 (
                                            id, 
                                            title, 
                                            url, 
                                            permalink, 
                                            content,
                                            author,
                                            created,
                                            last_seen,