Example #1
0
def _post_row(post):
    """Build one CSV row for ``post``, in the column order of the header
    written by ``posts2csv``.

    ``post`` is indexed like a dict.  The caller must already have checked
    that it has a non-empty "selftext" and a "subreddit" key.
    """
    return [
        post["id"],
        None,  # replyto: always left empty for these rows
        post["author"],
        post["author_flair_text"],
        str(post["over_18"]),
        str(post["score"]),
        post["subreddit"],  # subreddit name doubles as the forum identifier
        "Reddit",
        post["title"],
        # `tz` is a module-level name defined outside this block.
        datetime.fromtimestamp(post["created"], tz).isoformat(),
        "reddit",
        # "selftext" is guaranteed present here, so no fallback to "url" is
        # needed (the old eager post["url"] lookup could raise KeyError).
        post["selftext"],
    ]


def _export_posts(csvp, source, seen_posts, limit, verbose):
    """Write the text posts of one subreddit/user object to ``csvp``.

    ``source`` must expose ``name``, ``post_ids`` and ``post(post_id)``.
    Posts are fetched one at a time so that hitting the limit stops the
    fetching as well.

    Returns ``(remaining_limit, limit_reached)``.  ``limit_reached`` is True
    only when the counter hit exactly zero right after a row was written.
    """
    if verbose:
        print(source.name)
    postids = set(source.post_ids) - seen_posts
    for i, post_id in enumerate(postids):
        post = source.post(post_id)
        if verbose and i % 1000 == 999:
            print("post", i, "of", len(postids), limit, "to go")
        # Skip URL-only posts.
        if "selftext" not in post or post["selftext"] == "":
            continue
        if "subreddit" not in post:
            print("No subreddit in post " + post["id"])
            continue
        # The stored id may differ from the key it was fetched under, so
        # check it against seen_posts as well.
        if post["id"] in seen_posts:
            continue
        csvp.writerow(_post_row(post))
        limit -= 1
        if limit == 0:
            return limit, True
    return limit, False


def posts2csv(post_f, authors=None, subreddits=None, seen_posts=None, verbose=True, limit=1000):
    """Export Reddit text posts from the local MongoDB store as CSV.

    Writes a header row followed by one row per post that has a non-empty
    "selftext" and a "subreddit" field.  All requested subreddits are
    processed first, then all requested authors.

    NOTE: ``seen_posts`` is only read, never updated, so a post reachable
    through both a subreddit and an author is written once for each.

    Args:
        post_f: Writable file-like object handed to ``csv.writer``.
        authors: Iterable of user names to export; ``None`` means none.
        subreddits: Iterable of subreddit names to export; ``None`` means
            none.
        seen_posts: Set of post ids to skip; ``None`` means an empty set.
            Must be a set/frozenset because it is subtracted from a set.
        verbose: When true, print each source name and a progress line
            every 1000 posts.
        limit: Stop after this many rows have been written.  The counter is
            decremented after each row and compared with zero, so a limit of
            zero or less never triggers the early stop.

    Returns:
        None.
    """
    reddit = Reddit(MongoClient('mongodb://127.0.0.1:27017')["reddit"])

    # None defaults mean "nothing requested", not a crash on iteration.
    subreddits = [reddit.get_subreddit(s) for s in (subreddits or [])]
    authors = [reddit.get_user(a) for a in (authors or [])]
    if seen_posts is None:
        seen_posts = set()

    # subreddit info doesn't seem to have the "subreddit_id".   To do : get that with r/subreddit/<name>/about
    # for now, use subreddit name as forum identifier
    csvp = csv.writer(post_f)
    csvp.writerow([
        "id", "replyto", "username", "user_annotation_flairtext",
        "annotation_over18", "annotation_score", "forum", "discourse",
        "title", "when", "dataset_file", "post",
    ])

    for source in subreddits + authors:
        limit, limit_reached = _export_posts(csvp, source, seen_posts, limit, verbose)
        if limit_reached:
            return