def posts2csv(post_f, authors=None, subreddits=None, seen_posts=None, verbose=True, limit=1000):
    """Export Reddit self-posts (text posts) to CSV.

    Posts are read through a ``Reddit`` wrapper around the local MongoDB
    ``reddit`` database. All requested subreddits are processed first, then
    all requested authors. URL-only posts (missing or empty ``selftext``) and
    posts without a ``subreddit`` field are skipped.

    Args:
        post_f: Writable text file object that receives the CSV output (the
            ``csv`` module recommends opening it with ``newline=""``).
        authors: Iterable of user names whose posts are exported, or ``None``
            for no authors.
        subreddits: Iterable of subreddit names whose posts are exported, or
            ``None`` for no subreddits.
        seen_posts: Iterable of post ids to exclude, or ``None`` for none.
            It is copied and never modified.
        verbose: When true, print each source name and periodic progress.
        limit: Stop after this many rows have been written. The counter is
            decremented per written row and the export stops when it reaches
            exactly 0, so a non-positive starting value never stops early.

    Returns:
        None. Output is written to ``post_f``.
    """
    reddit = Reddit(MongoClient('mongodb://127.0.0.1:27017')["reddit"])
    # Tolerate the None defaults instead of failing on iteration.
    subreddits = [reddit.get_subreddit(s) for s in (subreddits or [])]
    authors = [reddit.get_user(a) for a in (authors or [])]
    # Private copy: avoids a shared mutable default and accepts any iterable.
    seen = set(seen_posts) if seen_posts is not None else set()

    # TODO: subreddit info doesn't seem to have the "subreddit_id"; get that
    # with r/subreddit/<name>/about. For now, the subreddit name is used as
    # the forum identifier.
    csvp = csv.writer(post_f)
    csvp.writerow("id,replyto,username,user_annotation_flairtext,annotation_over18,annotation_score,forum,discourse,title,when,dataset_file,post".split(","))

    # NOTE(review): ids written here are not added to `seen`, so a post that
    # is reachable both through a subreddit and through an author is written
    # twice, as in the original code -- confirm whether that is intended.
    for source in [*subreddits, *authors]:
        limit, exhausted = _write_post_rows(csvp, source, seen, limit, verbose)
        if exhausted:
            return


def _write_post_rows(csvp, source, seen_posts, limit, verbose):
    """Write one CSV row per eligible self-post of *source*.

    *source* is a subreddit or user object exposing ``name``, ``post_ids``
    and ``post(post_id)``. Returns ``(remaining_limit, exhausted)`` where
    ``exhausted`` is true only if the counter reached exactly 0 here.
    """
    if verbose:
        print(source.name)
    postids = set(source.post_ids) - seen_posts
    for i, post_id in enumerate(postids):
        # Fetch lazily so nothing is loaded past the point where limit hits 0.
        post = source.post(post_id)
        if verbose and i % 1000 == 999:
            print("post", i, "of", len(postids), limit, "to go")
        if "selftext" not in post or post["selftext"] == "":
            continue  # Skip URL-only posts
        if "subreddit" not in post:
            print("No subreddit in post " + post["id"])
            continue
        if post["id"] in seen_posts:
            continue
        csvp.writerow([
            post["id"],
            None,  # replyto: top-level posts reply to nothing
            post["author"],
            post["author_flair_text"],
            str(post["over_18"]),
            str(post["score"]),
            post["subreddit"],
            "Reddit",
            post["title"],
            # `tz` is a module-level name defined outside this block.
            datetime.fromtimestamp(post["created"], tz).isoformat(),
            "reddit",
            # selftext is known to be present and non-empty here, so no
            # fallback to post["url"] is needed (and none is evaluated).
            post["selftext"],
        ])
        limit -= 1
        if limit == 0:
            return limit, True
    return limit, False