コード例 #1
0
def download_the_whole_subreddit(storage_dir, subreddit_name, ts_interval, largest_timestamp):
    """Save every submission of a subreddit by sweeping backwards in time.

    The function searches the subreddit with a ``timestamp:A..B`` cloudsearch
    query for the window ``[cts1, cts2]``, expands the comments of every hit,
    stores it with ``save_submission`` and then moves the window further into
    the past. The window width adapts to the number of hits.

    Args:
        storage_dir: Directory the submissions are saved into; created if
            missing.
        subreddit_name: Name of the subreddit to search.
        ts_interval: Initial window width, in seconds.
        largest_timestamp: Upper bound (epoch seconds) of the first window.
            ``None`` means "now plus twelve hours".
    """
    # 0o755 is accepted by Python 2.6+ and Python 3; the bare ``0755`` form is
    # a syntax error on Python 3.
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    if largest_timestamp is None:
        largest_timestamp = int(time.time()) + 12*3600
    cts2 = largest_timestamp
    cts1 = largest_timestamp - ts_interval
    current_ts_interval = ts_interval
    while True:
        try:
            search_results = list(r.search('timestamp:{}..{}'.format(cts1, cts2), subreddit=subreddit_name, syntax='cloudsearch'))
        except Exception as e:
            # NOTE(review): the same window is retried immediately and without
            # a retry cap, so a persistent failure keeps the loop spinning here.
            logging.exception(e)
            continue

        logging.debug("Got {} submissions in interval {}..{}".format(len(search_results), cts1, cts2))
        # 25 hits is treated as "the result page is full" (presumably the
        # default page size of r.search -- confirm), so the window is narrowed
        # and searched again. A window that is already at the minimum width is
        # processed as-is; otherwise the interval would collapse to 0 and the
        # sweep would never advance again.
        if len(search_results) == 25 and current_ts_interval > 1:
            # Floor division keeps the timestamps integral on Python 3 too.
            current_ts_interval = max(1, current_ts_interval // 2)
            cts1 = cts2 - current_ts_interval
            logging.debug("Reducing ts interval to {}".format(current_ts_interval))
            continue

        for submission in search_results:
            submission.replace_more_comments(limit=None)
            save_submission(submission, storage_dir)

        # Slide the window one step into the past.
        cts2 = cts1
        cts1 = cts2 - current_ts_interval

        if cts1 < 0:
            break

        # Few hits: widen the window used for the steps after the next one.
        if len(search_results) <= 7:
            current_ts_interval *= 2
            logging.debug("Increasing ts interval to {}".format(current_ts_interval))
コード例 #2
0
def crawl_continuously(storage_dir):
    """Save each submission yielded by the "learnprogramming" stream.

    Every submission coming out of ``submission_stream`` has its comments
    expanded and is then handed to ``save_submission`` together with
    ``storage_dir``.

    Args:
        storage_dir: Passed through to ``save_submission`` for every
            submission.
    """
    # NOTE(review): the API credentials are hard-coded in source; consider
    # loading them from configuration instead.
    reddit = praw.Reddit(
        user_agent='searcheng by /u/mansa503',
        client_id='XeYpcQ3WK7ST9A',
        client_secret='O4SYscjZbsg5m0EU31caz1SbeiE',
    )
    stream = submission_stream(reddit, "learnprogramming")
    for post in stream:
        post.replace_more_comments(limit=None, threshold=0)
        save_submission(post, storage_dir)
コード例 #3
0
def get_as_much_stuff_as_possible(storage_dir):
    """Save the submissions returned by several "learnprogramming" listings.

    For each listing method named below, up to ``limit=1000`` submissions are
    requested and every one is passed to ``save_submission``.

    Args:
        storage_dir: Directory the submissions are saved into; created if
            missing.
    """
    # 0o755 is accepted by Python 2.6+ and Python 3; the bare ``0755`` form is
    # a syntax error on Python 3.
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    # The subreddit handle does not depend on the listing, so look it up once.
    subreddit = r.get_subreddit('learnprogramming')
    for method_name in ("get_hot", "get_new", "get_top_from_all", "get_top_from_week",
                        "get_top_from_month", "get_top_from_year", "get_top_from_day",
                        "get_top_from_hour"):
        method = getattr(subreddit, method_name)
        for s in method(limit=1000):
            save_submission(s, storage_dir)
コード例 #4
0
def get_as_much_stuff_as_possible(storage_dir):
    """Save the "hot" and "new" listings of "learnprogramming".

    For each of the two listings, up to ``limit=1000`` submissions are
    requested and every one is passed to ``save_submission``.

    Args:
        storage_dir: Directory the submissions are saved into; created if
            missing.
    """
    # 0o755 is accepted by Python 2.6+ and Python 3; the bare ``0755`` form is
    # a syntax error on Python 3.
    mkpath(storage_dir, mode=0o755)
    # NOTE(review): the API credentials are hard-coded in source; consider
    # loading them from configuration instead.
    r = praw.Reddit(user_agent='searcheng by /u/mansa503',
                    client_id='XeYpcQ3WK7ST9A',
                    client_secret='O4SYscjZbsg5m0EU31caz1SbeiE')
    # The subreddit handle does not depend on the listing, so look it up once.
    subreddit = r.subreddit('learnprogramming')
    for method_name in ("hot", "new"):
        method = getattr(subreddit, method_name)
        for s in method(limit=1000):
            save_submission(s, storage_dir)
コード例 #5
0
def get_as_much_stuff_as_possible(storage_dir):
    """Save the submissions returned by several "learnprogramming" listings.

    Each listing method named below is called with ``limit=1000`` and every
    submission it yields is passed to ``save_submission``.

    Args:
        storage_dir: Directory the submissions are saved into; created if
            missing.
    """
    # 0o755 is accepted by Python 2.6+ and Python 3; the bare ``0755`` form is
    # a syntax error on Python 3.
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    # The subreddit handle does not depend on the listing, so look it up once.
    subreddit = r.get_subreddit('learnprogramming')
    for method_name in (
            "get_hot", "get_new", "get_top_from_all", "get_top_from_week",
            "get_top_from_month", "get_top_from_year", "get_top_from_day",
            "get_top_from_hour",
    ):
        method = getattr(subreddit, method_name)
        for s in method(limit=1000):
            save_submission(s, storage_dir)
コード例 #6
0
def download_the_whole_subreddit(storage_dir, subreddit_name, ts_interval,
                                 largest_timestamp):
    """Save every submission of a subreddit by sweeping backwards in time.

    The function searches the subreddit with a ``timestamp:A..B`` cloudsearch
    query for the window ``[cts1, cts2]``, expands the comments of every hit,
    stores it with ``save_submission`` and then moves the window further into
    the past. The window width adapts to the number of hits.

    Args:
        storage_dir: Directory the submissions are saved into; created if
            missing.
        subreddit_name: Name of the subreddit to search.
        ts_interval: Initial window width, in seconds.
        largest_timestamp: Upper bound (epoch seconds) of the first window.
            ``None`` means "now plus twelve hours".
    """
    # Maximum number of hits requested per search. It is passed explicitly so
    # the "page is full" test below does not depend on the client's default
    # listing limit.
    page_limit = 25

    # 0o755 is accepted by Python 2.6+ and Python 3; the bare ``0755`` form is
    # a syntax error on Python 3.
    mkpath(storage_dir, mode=0o755)
    # NOTE(review): the API credentials are hard-coded in source; consider
    # loading them from configuration instead.
    r = praw.Reddit(user_agent='searcheng by /u/mansa503',
                    client_id='XeYpcQ3WK7ST9A',
                    client_secret='O4SYscjZbsg5m0EU31caz1SbeiE')
    # Honour the caller's subreddit instead of a hard-coded name.
    subreddit = r.subreddit(subreddit_name)
    if largest_timestamp is None:
        largest_timestamp = int(time.time()) + 12 * 3600
    cts2 = largest_timestamp
    cts1 = largest_timestamp - ts_interval
    current_ts_interval = ts_interval
    while True:
        try:
            # NOTE(review): relies on the search backend still accepting
            # cloudsearch timestamp queries -- confirm.
            search_results = list(
                subreddit.search(
                    'timestamp:{}..{}'.format(cts1, cts2),
                    syntax='cloudsearch',
                    limit=page_limit))
        except Exception as e:
            # NOTE(review): the same window is retried immediately and without
            # a retry cap, so a persistent failure keeps the loop spinning here.
            logging.exception(e)
            continue

        logging.debug("Got {} submissions in interval {}..{}".format(
            len(search_results), cts1, cts2))
        # A full page means the window may hold more submissions than were
        # returned, so narrow it and search again. A window that is already at
        # the minimum width is processed as-is; otherwise the interval would
        # collapse to 0 and the sweep would never advance again.
        if len(search_results) >= page_limit and current_ts_interval > 1:
            # Floor division keeps the timestamps integral on Python 3 too.
            current_ts_interval = max(1, current_ts_interval // 2)
            cts1 = cts2 - current_ts_interval
            logging.debug(
                "Reducing ts interval to {}".format(current_ts_interval))
            continue

        for submission in search_results:
            # The comment-expansion call lives on the comment forest in the
            # PRAW generation that provides ``Reddit.subreddit``.
            submission.comments.replace_more(limit=None)
            save_submission(submission, storage_dir)

        # Slide the window one step into the past.
        cts2 = cts1
        cts1 = cts2 - current_ts_interval

        if cts1 < 0:
            break

        # Few hits: widen the window used for the steps after the next one.
        if len(search_results) <= 7:
            current_ts_interval *= 2
            logging.debug(
                "Increasing ts interval to {}".format(current_ts_interval))
コード例 #7
0
def crawl_continuously(storage_dir):
    """Save each submission yielded by the "learnprogramming" stream.

    Args:
        storage_dir: Passed through to ``save_submission`` for every
            submission.
    """
    reddit = praw.Reddit(
        user_agent='SearchingReddit project 0.2 by /u/godlikesme',
    )
    stream = submission_stream(reddit, "learnprogramming")
    for post in stream:
        save_submission(post, storage_dir)
コード例 #8
0
def crawl_continuously(storage_dir):
    """Save each "learnprogramming" stream submission with expanded comments.

    Every submission coming out of ``submission_stream`` has
    ``replace_more_comments(limit=None)`` called on it before it is handed to
    ``save_submission``.

    Args:
        storage_dir: Passed through to ``save_submission`` for every
            submission.
    """
    reddit = praw.Reddit(
        user_agent='SearchingReddit project 0.2 by /u/godlikesme',
    )
    stream = submission_stream(reddit, "learnprogramming")
    for post in stream:
        post.replace_more_comments(limit=None)
        save_submission(post, storage_dir)