def testPostfixNotation(self, raw_comment, symbol):
    """Check a single postfix spoiler: `[Here be spoilers](#spoiler)`.

    The body mixes the spoiler with parentheses, a ``#`` and an ordinary
    markdown link; only the link whose target is ``symbol`` should be
    reported, and it carries no topic.
    """
    raw_comment["body"] = (
        f"Heyho th(er)e# are some [spoiler text]({symbol}) ['link'](text)"
    )
    parsed = Comment.from_raw(raw_comment)
    found = parsed.spoilers()
    assert len(found) == 1
    only = found[0]
    assert only.text == "spoiler text"
    assert only.topic is None
def testTagged(self, raw_comment, symbol):
    """Check a tagged spoiler: the link text is the topic, the title the text."""
    body = f"""Lets.topic some [Tag]({symbol} spoiler) spoilers /s."""
    raw_comment["body"] = body
    parsed = Comment.from_raw(raw_comment)
    found = parsed.spoilers()
    assert len(found) == 1
    tagged = found[0]
    assert tagged.text == "spoiler"
    assert tagged.topic == "Tag"
def testNewNotation(self, raw_comment):
    """Check a single spoiler in the new notation: `>!Here be spoilers!<`.

    The body also contains parentheses and a reversed-bracket pseudo link
    (``(spoilers)[asdf]``); only the ``>!...!<`` span should be reported,
    and it carries no topic.
    """
    raw_comment["body"] = (
        "Heyho th(er)e are some (spoilers)[asdf] here"
        ">!spoiler text!<"
    )
    comment = Comment.from_raw(raw_comment)
    spoilers = comment.spoilers()
    assert len(spoilers) == 1
    assert spoilers[0].text == "spoiler text"
    # Identity check, matching the sibling tests: `== None` could be
    # satisfied by an object with a permissive __eq__.
    assert spoilers[0].topic is None
def testTaggedMultiple(self, raw_comment, symbol):
    """Check two tagged spoilers in one body, one quoted and one using `/s`."""
    raw_comment["body"] = f"""Lets.topic some [Tag]({symbol} "spoiler") spoilers [Tag2](/s more spoiler) /s."""
    parsed = Comment.from_raw(raw_comment)
    found = parsed.spoilers()
    assert len(found) == 2
    # (text, topic) pairs in the order the spoilers appear in the body.
    expected = (("spoiler", "Tag"), ("more spoiler", "Tag2"))
    for position, (text, topic) in enumerate(expected):
        assert found[position].text == text
        assert found[position].topic == topic
def testPostfixNotationMultiple(self, raw_comment, symbol):
    """Check two postfix spoilers in one body; neither carries a topic."""
    raw_comment["body"] = f"""Heyho [some]({symbol})th(er)e# are some [spoiler text]({symbol}) ['link'](text)"""
    parsed = Comment.from_raw(raw_comment)
    found = parsed.spoilers()
    assert len(found) == 2
    # Expected spoiler texts in order of appearance.
    for position, text in enumerate(("some", "spoiler text")):
        assert found[position].text == text
        assert found[position].topic is None
def testNewNotationMultiple(self, raw_comment):
    """Check two spoilers in the new notation: `>!Here be spoilers!<`.

    A stray ``<`` precedes the first spoiler to make sure it does not
    confuse the delimiter matching.
    """
    raw_comment["body"] = "Heyho t<here are >!some!< spoilers here >!spoiler text!<"
    parsed = Comment.from_raw(raw_comment)
    found = parsed.spoilers()
    assert len(found) == 2
    # Expected spoiler texts in order of appearance; none has a topic.
    for position, text in enumerate(("some", "spoiler text")):
        assert found[position].text == text
        assert found[position].topic is None
def test_conversion(self, raw_comment):
    """Round-trip a comment through `to_row` / `from_row` and expect equality."""
    original = Comment.from_raw(raw_comment)
    restored = Comment.from_row(original.to_row())
    assert restored == original
def testParsing(self, raw_comment):
    """Check that the comment and parent ids are decoded from base 36."""
    expected_id = int("666", base=36)
    expected_parent_id = int("duzm4vy", base=36)
    parsed = Comment.from_raw(raw_comment)
    assert parsed.id == expected_id
    assert parsed.parent_comment_id == expected_parent_id
def main(args):
    """Collect spoiler statistics and spoiler / non-spoiler comment samples.

    Loads comments and posts, drops comments whose id is reported by
    ``get_duplicate_ids``, and then, depending on ``args.collect``, writes
    monthly statistics, the whitelisted spoiler comments that belong to
    non-spoiler posts, and a per-subreddit sample of non-spoiler comments
    sized to roughly match the spoiler count.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``comments_path``, ``posts_path``, ``whitelist`` (an open text
            file with one subreddit name per line) and ``collect`` (a
            container that may hold ``"statistics"``, ``"spoiler_comments"``
            and/or ``"non_spoiler_comments"``).
    """
    session = util.build_session(name="Reddit Subreddit Counts")
    app_id = session.sparkContext.applicationId

    all_comments = Comment.load_comments(
        session, path=args.comments_path
    ).persist(StorageLevel.DISK_ONLY)

    # Record the duplicated ids, then exclude every comment carrying one.
    duplicate_ids = get_duplicate_ids(all_comments)
    duplicate_ids.write.json("duplicates-test-md5-%s.csv" % app_id)
    comments = all_comments.join(
        duplicate_ids, duplicate_ids.id == all_comments.id, "left_anti"
    ).persist(StorageLevel.DISK_ONLY)
    all_comments.unpersist()

    spoiler_comments = comments.filter(
        comments.contains_spoiler == True
    ).persist(StorageLevel.MEMORY_AND_DISK)
    posts = Post.load_posts(
        session, path=args.posts_path
    ).persist(StorageLevel.DISK_ONLY)

    # Blank lines are skipped so they do not become empty subreddit names.
    whitelist = [
        name for name in (line.strip() for line in args.whitelist) if name
    ]
    if not whitelist:
        # Checked before any filter is built: every output below is
        # restricted to the whitelist and will be empty without one.
        print("Error, whitelist has 0 elements")
    # A column predicate instead of a string-built SQL expression, so that
    # subreddit names containing quotes (or an empty whitelist) cannot
    # produce an invalid filter.
    in_whitelist = col("subreddit").isin(whitelist)

    non_spoiler_post_ids = (
        posts
        .filter(posts.spoiler == False)
        .filter(posts.over_18 == False)
        .filter(~(posts.title.contains("Spoiler") | posts.title.contains("spoiler")))
        .filter(in_whitelist)
        .select("id")
        .persist(StorageLevel.DISK_ONLY)
    )
    whitelisted_spoiler_comments = (
        spoiler_comments
        .filter(comments.author != "AutoModerator")
        .filter(in_whitelist)
        .persist(StorageLevel.MEMORY_AND_DISK)
    )
    # Needed by both the spoiler export and the non-spoiler sampling, so it
    # is defined outside either branch. Building the join is lazy; nothing
    # is computed unless one of the branches below uses it.
    spoiler_comments_without_spoiler_posts = (
        whitelisted_spoiler_comments
        .join(
            non_spoiler_post_ids,
            whitelisted_spoiler_comments.post_id == non_spoiler_post_ids.id,
        )
        .drop(non_spoiler_post_ids.id)
    )

    if "statistics" in args.collect:
        comment_spoilers_per_month_and_sub(session, spoiler_comments)
        save_comment_per_month(
            session, spoiler_comments, base_name="spoilers_comments_per_month"
        )
        save_comment_per_month(
            session, comments, base_name="total_comments-per_month"
        )

    if "spoiler_comments" in args.collect:
        spoiler_comments_without_spoiler_posts.write.json(
            "reddit/spoiler-comments-%s.csv" % app_id
        )

    if "non_spoiler_comments" in args.collect:
        spoiler_counts_per_sub = {
            row["subreddit"]: row["count"]
            for row in spoiler_comments_without_spoiler_posts
            .groupby("subreddit")
            .count()
            .collect()
        }
        print(spoiler_counts_per_sub)

        non_spoiler_comments = (
            comments
            .filter(~(col("text").like("()[%]")))
            .filter(comments.distinguished.isNull())
            .filter(comments.score >= 3)
            .filter((comments.text != "[deleted]") | (comments.author != "[deleted]"))
            .filter((comments.text != "[removed]") | (comments.author != "[deleted]"))
            .filter(comments.author != "AutoModerator")
            .filter(comments.contains_spoiler == False)
            .filter(in_whitelist)
            .join(non_spoiler_post_ids, comments.post_id == non_spoiler_post_ids.id)
            .drop(non_spoiler_post_ids.id)
            .persist(StorageLevel.DISK_ONLY)
        )

        for subreddit, spoiler_count in spoiler_counts_per_sub.items():
            subreddit_non_spoilers = non_spoiler_comments.filter(
                non_spoiler_comments.subreddit == subreddit
            ).persist(StorageLevel.MEMORY_AND_DISK)
            non_spoiler_count = subreddit_non_spoilers.count()

            # sample() draws rows by probability, so the resulting count
            # only approximates spoiler_count.
            if spoiler_count > non_spoiler_count:
                print(
                    "[Warning] We will not be able to sample enough non-spoilers for %s (spoilers: %d, non-spoilers: %d)"
                    % (subreddit, spoiler_count, non_spoiler_count)
                )
                fraction = 1.0
            else:
                fraction = spoiler_count / non_spoiler_count

            sampled = subreddit_non_spoilers.sample(fraction=fraction, seed=42)
            sampled.write.json(
                "reddit/non-spoilers-%s.csv" % app_id, mode="append"
            )
            subreddit_non_spoilers.unpersist()