def download_the_whole_subreddit(storage_dir, subreddit_name, ts_interval,
                                 largest_timestamp):
    """Crawl an entire subreddit by sliding a timestamp window backwards.

    Uses Reddit cloudsearch ``timestamp:a..b`` queries.  The window is
    halved whenever a page comes back full (25 hits, the API page size,
    meaning results may have been truncated) and doubled when results are
    sparse, until the window start drops below epoch 0.

    :param storage_dir: directory where submissions are saved
    :param subreddit_name: name of the subreddit to crawl
    :param ts_interval: initial window width, in seconds
    :param largest_timestamp: newest timestamp to start from; ``None``
        means "now plus 12 hours" (slack so nothing recent is missed)
    """
    # 0o755 is valid on both py2.6+ and py3 (bare 0755 is a py3 syntax error).
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    if largest_timestamp is None:
        largest_timestamp = int(time.time()) + 12 * 3600
    cts2 = largest_timestamp
    cts1 = largest_timestamp - ts_interval
    current_ts_interval = ts_interval
    while True:
        try:
            search_results = list(r.search(
                'timestamp:{}..{}'.format(cts1, cts2),
                subreddit=subreddit_name, syntax='cloudsearch'))
        except Exception as e:
            # Transient network/API failure: log it and retry the same window.
            logging.exception(e)
            continue
        logging.debug("Got {} submissions in interval {}..{}".format(
            len(search_results), cts1, cts2))
        if len(search_results) == 25:
            # Full page => results were probably truncated; halve the window
            # and retry.  Floor division keeps the timestamps integers
            # (plain /= would produce floats on Python 3 and put float
            # timestamps into the cloudsearch query).
            current_ts_interval //= 2
            cts1 = cts2 - current_ts_interval
            logging.debug("Reducing ts interval to {}".format(
                current_ts_interval))
            continue
        for submission in search_results:
            submission.replace_more_comments(limit=None)
            save_submission(submission, storage_dir)
        # Slide the window back in time by one interval.
        cts2 = cts1
        cts1 = cts2 - current_ts_interval
        if cts1 < 0:
            break
        if len(search_results) <= 7:
            # Sparse window: widen it so we make fewer API calls.
            current_ts_interval *= 2
            logging.debug("Increasing ts interval to {}".format(
                current_ts_interval))
def crawl_continuously(storage_dir):
    """Continuously stream new /r/learnprogramming submissions, expand their
    comment trees, and save each one to *storage_dir*.

    Runs until the stream is exhausted or raises.
    """
    # SECURITY: client_id/client_secret are hard-coded and committed here;
    # they should be loaded from environment variables or a config file
    # kept out of version control.
    r = praw.Reddit(user_agent='searcheng by /u/mansa503',
                    client_id='XeYpcQ3WK7ST9A',
                    client_secret='O4SYscjZbsg5m0EU31caz1SbeiE')
    for s in submission_stream(r, "learnprogramming"):
        # NOTE(review): replace_more_comments() is the praw 3 API, but a
        # client_id/client_secret Reddit instance is praw 4+ style, where
        # the equivalent is s.comments.replace_more() -- confirm which
        # praw version is actually installed.
        s.replace_more_comments(limit=None, threshold=0)
        save_submission(s, storage_dir)
def get_as_much_stuff_as_possible(storage_dir):
    """Save up to 1000 submissions from each praw-3 listing (hot, new, and
    the top-of-{all,week,month,year,day,hour} listings) of
    /r/learnprogramming into *storage_dir*.
    """
    # 0o755 is valid on both py2.6+ and py3 (bare 0755 is a py3 syntax error).
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    # Loop-invariant: resolve the subreddit handle once instead of per listing.
    subreddit = r.get_subreddit('learnprogramming')
    listing_methods = [
        "get_hot", "get_new", "get_top_from_all", "get_top_from_week",
        "get_top_from_month", "get_top_from_year", "get_top_from_day",
        "get_top_from_hour",
    ]
    for method_name in listing_methods:
        method = getattr(subreddit, method_name)
        for s in method(limit=1000):
            save_submission(s, storage_dir)
def get_as_much_stuff_as_possible(storage_dir):
    """Save up to 1000 submissions from the 'hot' and 'new' listings of
    /r/learnprogramming (praw 4+ API) into *storage_dir*.

    NOTE(review): this redefines a function of the same name declared
    earlier in the file -- only the last definition takes effect.
    """
    # 0o755 is valid on both py2.6+ and py3 (bare 0755 is a py3 syntax error).
    mkpath(storage_dir, mode=0o755)
    # SECURITY: client_id/client_secret are hard-coded and committed here;
    # they should be loaded from environment variables or a config file
    # kept out of version control.
    r = praw.Reddit(user_agent='searcheng by /u/mansa503',
                    client_id='XeYpcQ3WK7ST9A',
                    client_secret='O4SYscjZbsg5m0EU31caz1SbeiE')
    # Loop-invariant: resolve the subreddit handle once instead of per listing.
    subreddit = r.subreddit('learnprogramming')
    for method_name in ["hot", "new"]:
        method = getattr(subreddit, method_name)
        for s in method(limit=1000):
            save_submission(s, storage_dir)
def get_as_much_stuff_as_possible(storage_dir):
    """Save up to 1000 submissions from each praw-3 listing of
    /r/learnprogramming into *storage_dir*.

    NOTE(review): this is a near-duplicate of an identically named
    function earlier in the file; later definitions shadow earlier ones,
    so the duplicates should probably be consolidated.
    """
    # 0o755 is valid on both py2.6+ and py3 (bare 0755 is a py3 syntax error).
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    # Loop-invariant: resolve the subreddit handle once instead of per listing.
    subreddit = r.get_subreddit('learnprogramming')
    for method_name in [
            "get_hot", "get_new", "get_top_from_all", "get_top_from_week",
            "get_top_from_month", "get_top_from_year", "get_top_from_day",
            "get_top_from_hour"]:
        method = getattr(subreddit, method_name)
        for s in method(limit=1000):
            save_submission(s, storage_dir)
def download_the_whole_subreddit(storage_dir, subreddit_name, ts_interval,
                                 largest_timestamp):
    """Crawl a whole subreddit via cloudsearch timestamp windows (praw 4+).

    The window is halved when a page comes back full (25 hits, the API
    page size) and doubled when results are sparse, sliding backwards in
    time until the window start drops below epoch 0.

    :param storage_dir: directory where submissions are saved
    :param subreddit_name: subreddit name (NOTE(review): currently ignored;
        the search is hard-wired to 'learnprogramming' -- confirm intent)
    :param ts_interval: initial window width, in seconds
    :param largest_timestamp: newest timestamp to start from; ``None``
        means "now plus 12 hours"
    """
    # 0o755 is valid on both py2.6+ and py3 (bare 0755 is a py3 syntax error).
    mkpath(storage_dir, mode=0o755)
    # SECURITY: client_id/client_secret are hard-coded and committed here;
    # they should be loaded from environment variables or a config file
    # kept out of version control.
    r = praw.Reddit(user_agent='searcheng by /u/mansa503',
                    client_id='XeYpcQ3WK7ST9A',
                    client_secret='O4SYscjZbsg5m0EU31caz1SbeiE')
    if largest_timestamp is None:
        largest_timestamp = int(time.time()) + 12 * 3600
    cts2 = largest_timestamp
    cts1 = largest_timestamp - ts_interval
    current_ts_interval = ts_interval
    while True:
        try:
            search_results = list(
                r.subreddit('learnprogramming').search(
                    'timestamp:{}..{}'.format(cts1, cts2),
                    syntax='cloudsearch'))
        except Exception as e:
            # Transient network/API failure: log it and retry the same window.
            logging.exception(e)
            continue
        logging.debug("Got {} submissions in interval {}..{}".format(
            len(search_results), cts1, cts2))
        if len(search_results) == 25:
            # Full page => results were probably truncated; halve the window
            # and retry.  Floor division keeps the timestamps integers
            # (plain /= would produce floats on Python 3 and put float
            # timestamps into the cloudsearch query).
            current_ts_interval //= 2
            cts1 = cts2 - current_ts_interval
            logging.debug(
                "Reducing ts interval to {}".format(current_ts_interval))
            continue
        for submission in search_results:
            # NOTE(review): replace_more_comments() is the praw 3 API; on a
            # praw 4+ submission the equivalent is
            # submission.comments.replace_more() -- confirm praw version.
            submission.replace_more_comments(limit=None)
            save_submission(submission, storage_dir)
        # Slide the window back in time by one interval.
        cts2 = cts1
        cts1 = cts2 - current_ts_interval
        if cts1 < 0:
            break
        if len(search_results) <= 7:
            # Sparse window: widen it so we make fewer API calls.
            current_ts_interval *= 2
            logging.debug(
                "Increasing ts interval to {}".format(current_ts_interval))
def crawl_continuously(storage_dir):
    """Save every new /r/learnprogramming submission as it appears.

    Blocks on the submission stream indefinitely; each streamed
    submission is handed to save_submission with *storage_dir*.
    """
    reddit = praw.Reddit(
        user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    stream = submission_stream(reddit, "learnprogramming")
    for submission in stream:
        save_submission(submission, storage_dir)
def crawl_continuously(storage_dir):
    """Stream new /r/learnprogramming submissions, fully expand each one's
    comment tree, and persist it to *storage_dir*.

    Blocks on the submission stream indefinitely.
    """
    reddit = praw.Reddit(
        user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    for submission in submission_stream(reddit, "learnprogramming"):
        # Fetch all "MoreComments" stubs before saving.
        submission.replace_more_comments(limit=None)
        save_submission(submission, storage_dir)