Exemple #1
0
 def testPostfixNotation(self, raw_comment, symbol):
     """Spoilers of the form: `[Here be spoilers](#spoiler)`

     The body holds exactly one postfix spoiler whose link target is the
     ``symbol`` fixture; the trailing ``['link'](text)`` must not be counted.
     """
     raw_comment["body"] = "Heyho th(er)e# are some " \
                           f"[spoiler text]({symbol}) ['link'](text)"
     comment = Comment.from_raw(raw_comment)
     # Parse once and run every assertion against the same result.
     spoilers = comment.spoilers()
     assert len(spoilers) == 1
     assert spoilers[0].text == "spoiler text"
     assert spoilers[0].topic is None
Exemple #2
0
 def testTagged(self, raw_comment, symbol):
     """A tagged spoiler ``[Tag](<symbol> spoiler)`` yields its text and topic."""
     body = f"Lets.topic some [Tag]({symbol} spoiler) spoilers /s."
     raw_comment["body"] = body
     found = Comment.from_raw(raw_comment).spoilers()
     assert len(found) == 1
     tagged = found[0]
     assert tagged.text == "spoiler"
     assert tagged.topic == "Tag"
Exemple #3
0
 def testNewNotation(self, raw_comment):
     """Spoilers of the form: `>!Here be spoilers!<`

     The body also contains parentheses and a ``(text)[text]`` pair that
     must not be counted as spoilers.
     """
     raw_comment["body"] = "Heyho th(er)e are some (spoilers)[asdf] here" \
                           ">!spoiler text!<"
     comment = Comment.from_raw(raw_comment)
     # Parse once and run every assertion against the same result.
     spoilers = comment.spoilers()
     assert len(spoilers) == 1
     assert spoilers[0].text == "spoiler text"
     # Identity check: `== None` could be satisfied by a custom __eq__.
     assert spoilers[0].topic is None
Exemple #4
0
 def testTaggedMultiple(self, raw_comment, symbol):
     """Two tagged spoilers in one body are returned in order with their topics."""
     body = f"""Lets.topic some [Tag]({symbol} "spoiler") spoilers [Tag2](/s more spoiler) /s."""
     raw_comment["body"] = body
     found = Comment.from_raw(raw_comment).spoilers()
     assert len(found) == 2
     # (text, topic) expected at each position of the parsed result.
     expected = (("spoiler", "Tag"), ("more spoiler", "Tag2"))
     for position, (text, topic) in enumerate(expected):
         assert found[position].text == text
         assert found[position].topic == topic
Exemple #5
0
 def testPostfixNotationMultiple(self, raw_comment, symbol):
     """Two postfix spoilers in a multi-line body are both found, without topics."""
     # The literal deliberately spans two lines; its newline and indentation
     # are part of the comment body under test.
     raw_comment["body"] = f"""Heyho [some]({symbol})th(er)e# are
                             some [spoiler text]({symbol}) ['link'](text)"""
     found = Comment.from_raw(raw_comment).spoilers()
     assert len(found) == 2
     expected_texts = ("some", "spoiler text")
     for position, text in enumerate(expected_texts):
         assert found[position].text == text
         assert found[position].topic is None
Exemple #6
0
 def testNewNotationMultiple(self, raw_comment):
     """Spoilers of the form: `>!Here be spoilers!<`

     Two such spoilers appear in one body, which also contains a stray ``<``
     that must not confuse the parser.
     """
     raw_comment[
         "body"] = "Heyho t<here are >!some!< spoilers here >!spoiler text!<"
     comment = Comment.from_raw(raw_comment)
     # Parse once and run every assertion against the same result.
     spoilers = comment.spoilers()
     assert len(spoilers) == 2
     assert spoilers[0].text == "some"
     assert spoilers[0].topic is None
     assert spoilers[1].text == "spoiler text"
     assert spoilers[1].topic is None
Exemple #7
0
 def test_conversion(self, raw_comment):
     """A comment survives a ``to_row`` / ``from_row`` round trip unchanged."""
     original = Comment.from_raw(raw_comment)
     restored = Comment.from_row(original.to_row())
     assert restored == original
Exemple #8
0
 def testParsing(self, raw_comment):
     """``id`` and ``parent_comment_id`` equal the base-36 decoding of the expected ids."""
     parsed = Comment.from_raw(raw_comment)
     expected_id = int("666", base=36)
     expected_parent_id = int("duzm4vy", base=36)
     assert parsed.id == expected_id
     assert parsed.parent_comment_id == expected_parent_id
def main(args):
    """Collect spoiler statistics and spoiler / non-spoiler comment sets.

    Loads comments and posts, removes every comment whose id is reported by
    ``get_duplicate_ids`` and then runs the stages named in ``args.collect``:

    * ``"statistics"``: per-month comment statistics.
    * ``"spoiler_comments"``: whitelisted spoiler comments that belong to
      posts which are not themselves marked as spoilers.
    * ``"non_spoiler_comments"``: for each subreddit, a random sample of
      non-spoiler comments sized to roughly match its spoiler count.

    Args:
        args: Parsed command line arguments. Reads ``comments_path``,
            ``posts_path``, ``whitelist`` (an object with ``readlines()``
            yielding one subreddit name per line) and ``collect`` (a
            container of stage names).
    """
    session = util.build_session(name="Reddit Subreddit Counts")
    app_id = session.sparkContext.applicationId

    all_comments = Comment.load_comments(session, path=args.comments_path)\
        .persist(StorageLevel.DISK_ONLY)
    duplicate_ids = get_duplicate_ids(all_comments)
    # NOTE: the output is written as JSON although the path ends in ".csv".
    duplicate_ids.write.json("duplicates-test-md5-%s.csv" % app_id)

    # Anti-join: keep only comments whose id is not among the duplicates.
    comments = all_comments.join(
        duplicate_ids, duplicate_ids.id == all_comments.id, "left_anti"
    ).persist(StorageLevel.DISK_ONLY)

    all_comments.unpersist()
    # `== True` builds a Spark column expression; it is not a Python bool test.
    spoiler_comments = comments.filter(comments.contains_spoiler == True)\
        .persist(StorageLevel.MEMORY_AND_DISK)

    posts = Post.load_posts(session, path=args.posts_path)\
        .persist(StorageLevel.DISK_ONLY)

    # Blank lines are skipped so they do not become a match on the empty
    # subreddit name.
    whitelist = [
        name
        for name in (line.strip() for line in args.whitelist.readlines())
        if name
    ]
    # A column predicate instead of a hand-built SQL string: it needs no
    # quoting of the names and stays valid when the whitelist is empty (it
    # then matches nothing instead of producing an unparsable "" filter).
    in_whitelist = col("subreddit").isin(whitelist)

    non_spoiler_post_ids = posts.filter(posts.spoiler == False)\
        .filter(posts.over_18 == False)\
        .filter(~(posts.title.contains("Spoiler") | posts.title.contains("spoiler")))\
        .filter(in_whitelist)\
        .select("id")\
        .persist(StorageLevel.DISK_ONLY)

    whitelisted_spoiler_comments = spoiler_comments\
        .filter(comments.author != "AutoModerator")\
        .filter(in_whitelist)\
        .persist(StorageLevel.MEMORY_AND_DISK)

    collect_spoilers = "spoiler_comments" in args.collect
    collect_non_spoilers = "non_spoiler_comments" in args.collect

    if collect_spoilers or collect_non_spoilers:
        # Both stages read this frame, so it is defined before either branch;
        # the non-spoiler stage must also work when the spoiler stage is not
        # requested.
        spoiler_comments_without_spoiler_posts = whitelisted_spoiler_comments\
            .join(non_spoiler_post_ids, whitelisted_spoiler_comments.post_id == non_spoiler_post_ids.id)\
            .drop(non_spoiler_post_ids.id)

    if "statistics" in args.collect:
        comment_spoilers_per_month_and_sub(session, spoiler_comments)
        save_comment_per_month(session, spoiler_comments, base_name="spoilers_comments_per_month")
        save_comment_per_month(session, comments, base_name="total_comments-per_month")

    if collect_spoilers:
        if not whitelist:
            print("Error, whitelist has 0 elements")
        spoiler_comments_without_spoiler_posts\
            .write.json("reddit/spoiler-comments-%s.csv" % app_id)

    if collect_non_spoilers:
        spoiler_counts_per_sub = {
            row["subreddit"]: row["count"]
            for row in spoiler_comments_without_spoiler_posts
            .groupby("subreddit")
            .count()
            .collect()
        }
        print(spoiler_counts_per_sub)

        # NOTE(review): the LIKE pattern "()[%]" matches text that starts with
        # "()[" and ends with "]" -- confirm this is the intended shape.
        # The two "[deleted]" / "[removed]" filters drop a row only when both
        # its text and its author match.
        non_spoiler_comments = comments\
            .filter(~(col("text").like("()[%]")))\
            .filter(comments.distinguished.isNull())\
            .filter(comments.score >= 3)\
            .filter((comments.text != "[deleted]") | (comments.author != "[deleted]"))\
            .filter((comments.text != "[removed]") | (comments.author != "[deleted]"))\
            .filter(comments.author != "AutoModerator")\
            .filter(comments.contains_spoiler == False)\
            .filter(in_whitelist)\
            .join(non_spoiler_post_ids, comments.post_id == non_spoiler_post_ids.id)\
            .drop(non_spoiler_post_ids.id)\
            .persist(StorageLevel.DISK_ONLY)

        for subreddit, spoiler_count in spoiler_counts_per_sub.items():
            subreddit_non_spoilers = non_spoiler_comments\
                .filter(non_spoiler_comments.subreddit == subreddit)\
                .persist(StorageLevel.MEMORY_AND_DISK)
            non_spoiler_count = subreddit_non_spoilers.count()
            # Due to this sampling we are not guaranteed to get the exact same counts
            if spoiler_count > non_spoiler_count:
                print(
                    "[Warning] We will not be able to sample enough non-spoilers for %s (spoilers: %d, non-spoilers: %d)" %
                    (subreddit, spoiler_count, non_spoiler_count)
                )
                fraction = 1.0
            else:
                # spoiler_count comes from a group count, so it is at least 1;
                # reaching this branch implies non_spoiler_count >= 1 as well.
                fraction = spoiler_count / non_spoiler_count
            sampled = subreddit_non_spoilers.sample(fraction=fraction, seed=42)
            # Every subreddit appends to the same output directory.
            sampled.write\
                .json("reddit/non-spoilers-%s.csv" % app_id, mode="append")
            subreddit_non_spoilers.unpersist()