def dump_submission_info(sub_path):
    start = time.time()

    # Log useful info
    num_submissions = 7333071
    counter = 0
    num_dumps = 0
    num_urls_found = 0
    num_xposts_found = 0

    # Will store chunks of data to write, cleared at certain intervals
    # to avoid too much I/O.
    data_buffer = []

    # Collect submissions
    with open(sub_path) as submission_file:
        for submission_raw in submission_file:
            submission = json.loads(submission_raw)

            # Get year, month, day
            tmstp = submission.get("created")
            if type(tmstp) == int:
                date = datetime.fromtimestamp(tmstp)
            elif type(tmstp) == datetime:
                date = tmstp
            else:
                print "WARNING::::different date format found", type(tmstp), tmstp
                continue

            try:
                sid = submission.get("subreddit_id").values()[0]
                subreddit = subreddit_ids[sid]
            except (KeyError, AttributeError):
                # Missing or unknown subreddit id
                print "WARNING::::subreddit id not found:", submission
                continue

            year = str(date.year)
            month = str(date.month)
            day = str(date.day)
            path = os.path.join(subreddit[0].lower(), subreddit, year, month, day)
            make_dir(path)

            # Extract relevant pieces of information
            submission_id = str(submission.get("_id").values()[0])
            submission_title = submission.get("submission_title") or ''
            submission_text = submission.get("submission_text") or ''
            submission_prawid = submission.get("praw_id") or '-1'

            # Create information to write to file
            submission_info = submission_id + "\t" + submission_prawid + "\t" + submission_title + " " + submission_text
            url_info = submission.get("url") or ''
            xpost_info = utils.extract_subreddit_xpost(submission_title) or utils.get_internal_link(url_info)

            # Create data object and store in buffer
            data = {'submissions': (submission_info + '\n').encode("utf8")}
            if url_info:
                data['urls'] = (submission_id + '\t' + url_info + '\n').encode("utf8")
                num_urls_found += 1
            if xpost_info:
                data['xposts'] = (submission_id + '\t' + xpost_info + '\n').encode("utf8")
                num_xposts_found += 1
            data_buffer.append((path, data))

            # Write data and clear buffer every <chunk_size> submissions
            if len(data_buffer) >= chunk_size:
                filehandlers = {}
                for path, data in data_buffer:
                    try:
                        # See if file handlers have already been initialized
                        fhs = filehandlers[path]
                    except KeyError:
                        # Otherwise start a fresh set of handlers for this path
                        fhs = {}
                        filehandlers[path] = fhs
                    # Open any handler this item still needs, then write its pieces
                    for fh_type in ('submissions', 'urls', 'xposts'):
                        if fh_type in data:
                            if fh_type not in fhs:
                                fhs[fh_type] = open_file(path, fh_type + '.txt')
                            fhs[fh_type].write(data[fh_type])

                # Now close all file handlers
                for path, fhs in filehandlers.iteritems():
                    for fh_type, fh in fhs.iteritems():
                        fh.close()

                # Clear buffer
                data_buffer = []
                num_dumps += 1

            if counter % log_interval == 0:
                print "Progress:", counter, "submissions dumped out of", num_submissions, (counter / float(num_submissions)) * 100, "%"
                print "\tNumber of URLs found so far:", num_urls_found
                print "\tNumber of xposts found so far:", num_xposts_found
                print "\tNumber of dumps to file so far:", num_dumps
                print "\tTime spent:", (time.time() - start) / 60.0, "minutes"
                print ""
            counter += 1
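
# The dump functions in this file rely on two small helpers, make_dir and
# open_file, whose implementations are not shown in this section. The sketch
# below is only an assumption inferred from how they are called: make_dir
# creates the nested per-subreddit/year/month/day directory if needed, and
# open_file appends to a named file inside that directory so successive buffer
# flushes keep adding to the same per-day files. OUTPUT_ROOT is a hypothetical
# constant; the real code may root the paths differently.
OUTPUT_ROOT = os.path.join('..', 'data', 'reddit', 'dumps')  # assumed location

def make_dir(path):
    # Create the (possibly nested) output directory if it does not exist yet.
    full_path = os.path.join(OUTPUT_ROOT, path)
    if not os.path.exists(full_path):
        os.makedirs(full_path)

def open_file(path, filename):
    # Open <filename> inside the given directory for appending.
    return open(os.path.join(OUTPUT_ROOT, path, filename), 'a')
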
                subdomains[subdomain] = {subreddit: 1}
                num_subdomains += 1
            # Attempt to extrapolate domain
            except AttributeError:
                domain = submission.get("domain")
                commenturl = submission.get("comment_url")
                url = submission.get("url")
                # Try looking at praw's domain extraction
                if domain and domain.startswith("self."):
                    subreddit = domain.split(".")[1]
                    num_extrapolated += 1
                elif domain == "reddit.com" and url:
                    subreddit = utils.get_internal_link(url)
                    num_extrapolated += 1
                # Otherwise look at comment url for hints
                elif commenturl:
                    subreddit = utils.get_internal_link(commenturl)
                    num_extrapolated += 1
                else:
                    subreddit = None
                    num_not_found += 1

                # If domain was found, can't infer subdomain, just try to find
                # xpost in title
                xpost = utils.extract_subreddit_xpost(submission.get("submission_title"))
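
# The code above and the dump functions both lean on utils.extract_subreddit_xpost
# and utils.get_internal_link, which live outside this section. The sketch below
# is an assumption about their behavior, inferred from how they are called:
# extract_subreddit_xpost looks for an "x-post from /r/<name>" style marker in a
# title, and get_internal_link pulls the subreddit name out of a reddit.com URL.
# The regexes are illustrative, not the project's actual patterns, and in the
# real project these would live in utils.py.
import re

XPOST_RE = re.compile(r"x-?post(?:ed)?\s*(?:from)?\s*/?r/([A-Za-z0-9_]+)", re.IGNORECASE)
INTERNAL_RE = re.compile(r"reddit\.com/r/([A-Za-z0-9_]+)", re.IGNORECASE)

def extract_subreddit_xpost(title):
    # Return the subreddit named in an x-post marker, or None if there is none.
    if not title:
        return None
    match = XPOST_RE.search(title)
    return match.group(1) if match else None

def get_internal_link(url):
    # Return the subreddit a reddit.com URL points at, or None for external links.
    if not url:
        return None
    match = INTERNAL_RE.search(url)
    return match.group(1) if match else None
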
def dump_wayback_submission_info(sub_path):
    start = time.time()

    # Log useful info
    num_submissions = 18609213
    counter = 0
    num_dumps = 0
    num_urls_found = 0
    num_xposts_found = 0
    num_domains_found = 0
    num_no_date = 0

    # Will store chunks of data to write, cleared at certain intervals
    # to avoid too much I/O.
    data_buffer = {}

    # Will store chunks of data specifically for the comment file, used to
    # scrape comments later.
    comment_file = open(os.path.join('..', 'data', 'reddit', 'wayback_comments_to_scrape.txt'), 'w')
    comment_buffer = []

    # Collect submissions
    with open(sub_path) as submission_file:
        for submission_raw in submission_file:
            submission = json.loads(submission_raw)

            # Get year, month, day
            try:
                tmstp = submission.get("created").values()[0]
            except AttributeError:
                print "WARNING::::no timestamp?", submission.get("created")
                continue
            if type(tmstp) == int:
                date = datetime.fromtimestamp(tmstp / 1000)
            elif type(tmstp) == datetime:
                date = tmstp
            else:
                num_no_date += 1
                continue

            if submission.get("comment_url"):
                subreddit = utils.get_internal_link(submission.get("comment_url"))
                if not subreddit:
                    continue
            else:
                # Ignore ones that don't have an associated subreddit
                continue

            year = str(date.year)
            month = str(date.month)
            day = str(date.day)
            path = os.path.join(subreddit[0].lower(), subreddit, year, month, day)
            make_dir(path)

            # Extract relevant pieces of information
            submission_id = str(submission.get("_id").values()[0])
            submission_title = submission.get("submission_title") or ''
            submission_prawid = submission.get("reddit_id") or '-1'

            # Create information to write to file
            submission_info = submission_id + "\t" + submission_prawid + "\t" + submission_title
            url_info = submission.get("url") or ''
            domain_info = submission.get("domain")

            # Get potential xpost info
            xpost_info = utils.extract_subreddit_xpost(submission_title)
            if not xpost_info:
                if domain_info and domain_info.startswith("self."):
                    internal = domain_info.split(".")[1]
                    if internal != subreddit:
                        xpost_info = internal

            # Create data object and store in buffer
            data = {'submissions': (submission_info + '\n').encode("utf8")}
            if url_info:
                data['urls'] = (submission_id + '\t' + url_info + '\n').encode("utf8")
                num_urls_found += 1
            if xpost_info:
                data['xposts'] = (submission_id + '\t' + xpost_info + '\n').encode("utf8")
                num_xposts_found += 1
            if domain_info:
                data['domains'] = (submission_id + '\t' + domain_info + '\n').encode("utf8")
                num_domains_found += 1
            try:
                data_buffer[path].append(data)
            except KeyError:
                data_buffer[path] = [data]

            # Store information about comments for later scraping
            if submission_prawid != '-1':
                comment_buffer.append((path, submission_prawid))

            # Write data and clear buffers every <chunk_size> submissions
            if len(data_buffer) >= chunk_size:
                # Only keep a few files open at a time
                for path, data in data_buffer.iteritems():
                    # Open files associated with this path
                    filehandlers = {'submissions': open_file(path, 'submissions.txt'),
                                    'urls': open_file(path, 'urls.txt'),
                                    'xposts': open_file(path, 'xposts.txt'),
                                    'domains': open_file(path, 'domains.txt')}
                    # Write data to files
                    for data_item in data:
                        filehandlers['submissions'].write(data_item['submissions'])
                        if data_item.get('urls'):
                            filehandlers['urls'].write(data_item['urls'])
                        if data_item.get('xposts'):
                            filehandlers['xposts'].write(data_item['xposts'])
                        if data_item.get('domains'):
                            filehandlers['domains'].write(data_item['domains'])
                    # Now close files
                    filehandlers['submissions'].close()
                    filehandlers['urls'].close()
                    filehandlers['xposts'].close()
                    filehandlers['domains'].close()

                # Update comment file
                for path, pid in comment_buffer:
                    comment_file.write(path + '\t' + pid + '\n')

                # Clear buffers
                data_buffer = {}
                comment_buffer = []
                num_dumps += 1

            counter += 1
            if counter % log_interval == 0:
                print "Wayback Progress:", counter, "submissions dumped out of", num_submissions, (counter / float(num_submissions)) * 100, "%"
                print "\tNumber of URLs found so far:", num_urls_found
                print "\tNumber of xposts found so far:", num_xposts_found
                print "\tNumber of dumps to file so far:", num_dumps
                print "\tNumber with no date:", num_no_date
                time_spent = (time.time() - start) / 60.0
                print "\tTime spent:", time_spent, "minutes"
                print "\tExpected time remaining:", (num_submissions - counter) / (counter / time_spent)
                print ""

    comment_file.close()
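
# Both dump functions read module-level settings (chunk_size, log_interval,
# subreddit_ids) that are defined elsewhere in this file. The driver below is a
# hedged sketch of how they might be wired together; the numeric values, the
# file paths, and the subreddit_ids loading step are assumptions, not the
# project's actual entry point.
if __name__ == '__main__':
    chunk_size = 100000      # flush buffers every 100k submissions (assumed value)
    log_interval = 500000    # print progress every 500k submissions (assumed value)

    # subreddit_ids is assumed to map a raw subreddit_id to a subreddit name,
    # e.g. loaded from a tab-separated file produced by an earlier step.
    subreddit_ids = {}
    with open(os.path.join('..', 'data', 'reddit', 'subreddit_ids.txt')) as f:
        for line in f:
            sid, name = line.rstrip('\n').split('\t')
            subreddit_ids[sid] = name

    dump_submission_info(os.path.join('..', 'data', 'reddit', 'submissions.json'))
    dump_wayback_submission_info(os.path.join('..', 'data', 'reddit', 'wayback_submissions.json'))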