Code Example #1
import json
import os
import time
from datetime import datetime

# These examples also assume module-level names defined elsewhere in the
# original script: utils, subreddit_ids, make_dir, open_file, chunk_size
# and log_interval.
def dump_submission_info(sub_path):

	start = time.time()

	# Log useful info
	num_submissions = 7333071

	counter = 0
	num_dumps = 0
	num_urls_found = 0
	num_xposts_found = 0

	# Will store chunks of data to write, cleared at certain intervals
	# to avoid too much I/O.
	data_buffer = []

	# Collect submissions
	with open(sub_path) as submission_file:
		for submission_raw in submission_file:

			submission = json.loads(submission_raw)

			# Get year, month, day
			tmstp = submission.get("created")
			if type(tmstp) == int:
				date = datetime.fromtimestamp(tmstp)
			elif type(tmstp) == datetime:
				date = tmstp
			else:
				print "WARNING::::different date format found", type(tmstp), tmstp
				continue

			try:
				# AttributeError if "subreddit_id" is missing entirely,
				# KeyError if the id is not in the subreddit_ids mapping
				sid = submission.get("subreddit_id").values()[0]
				subreddit = subreddit_ids[sid]
			except (KeyError, AttributeError):
				print "WARNING::::subreddit id not found:", submission
				continue

			year = str(date.year)
			month = str(date.month)
			day = str(date.day)

			path = os.path.join(subreddit[0].lower(), subreddit, year, month, day)
			make_dir(path)

			# Extract relevant pieces of information
			submission_id = str(submission.get("_id").values()[0])
			submission_title = submission.get("submission_title") or ''
			submission_text = submission.get("submission_text") or ''
			submission_prawid = submission.get("praw_id") or '-1'

			# Create information to write to file
			submission_info = submission_id + "\t" + submission_prawid + "\t" + submission_title + " " + submission_text
			url_info = submission.get("url") or ''
			xpost_info = utils.extract_subreddit_xpost(submission_title) or utils.get_internal_link(url_info)

			# Create data object and store in buffer
			data = {'submissions': (submission_info + '\n').encode("utf8")}
			if url_info:
				data['urls'] = (submission_id + '\t' + url_info + '\n').encode("utf8")
				num_urls_found += 1
			if xpost_info:
				data['xposts'] = (submission_id + '\t' + xpost_info + '\n').encode("utf8")
				num_xposts_found += 1

			data_buffer.append((path, data))

			# Write data and clear buffer every <chunk_size> submissions
			if len(data_buffer) >= chunk_size:

				filehandlers = {}
				for path, data in data_buffer:
					# Lazily open one handler per (path, type); the data keys
					# match the output file names, so a later submission can
					# still add a type the first one for this path did not have.
					fhs = filehandlers.setdefault(path, {})
					for fh_type, payload in data.iteritems():
						if fh_type not in fhs:
							fhs[fh_type] = open_file(path, fh_type + '.txt')
						fhs[fh_type].write(payload)

				# Now close all file handlers
				for path, fhs in filehandlers.iteritems():
					for fh_type, fh in fhs.iteritems():
						fh.close()

				# Clear buffer
				data_buffer = []
				num_dumps += 1

			if counter % log_interval == 0:
				print "Progress:", counter, "submissions dumped out of", num_submissions, (counter / float(num_submissions)) * 100, "%"
				print "\tNumber of URLs found so far:", num_urls_found
				print "\tNumber of xposts found so far:", num_xposts_found
				print "\tNumber of dumps to file so far:", num_dumps
				print "\tTime spent:", (time.time() - start) / 60.0, "minutes"
				print ""
			counter += 1
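
The example above relies on two project-level helpers, make_dir and open_file, whose definitions are not shown. A minimal sketch of what they might look like, assuming open_file appends under a common output root (the root path and append mode are assumptions, not taken from the original):

import errno
import os

# Assumed output root; the original script's actual location is not shown.
OUTPUT_ROOT = os.path.join('..', 'data', 'reddit', 'submission_dump')

def make_dir(path):
	# Create the (possibly nested) day directory, ignoring "already exists".
	try:
		os.makedirs(os.path.join(OUTPUT_ROOT, path))
	except OSError as e:
		if e.errno != errno.EEXIST:
			raise

def open_file(path, filename):
	# Append so repeated buffer flushes for the same day keep adding lines.
	return open(os.path.join(OUTPUT_ROOT, path, filename), 'a')
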
Code Example #2
					subdomains[subdomain] = {subreddit: 1}

				num_subdomains += 1

		# Attempt to extrapolate the subreddit (e.g. from the domain or comment url)
		except AttributeError:
			domain = submission.get("domain")
			commenturl = submission.get("comment_url")
			url = submission.get("url")

			# Try looking at praw's domain extraction
			if domain and domain.startswith("self."):
				subreddit = domain.split(".")[1]
				num_extrapolated += 1
			elif domain == "reddit.com" and url:
				subreddit = utils.get_internal_link(url)
				num_extrapolated += 1

			# Otherwise look at comment url for hints
			elif commenturl:
				subreddit = utils.get_internal_link(commenturl)
				num_extrapolated += 1

			else:
				subreddit = None
				num_not_found += 1

			# If only the domain was found we can't infer the subdomain, so just
			# try to find an xpost in the title
			xpost = utils.extract_subreddit_xpost(submission.get("submission_title"))
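
All three snippets lean on two utils helpers whose implementations are not shown. Based only on how they are called (a URL in, a subreddit name or None out; a title in, an xposted subreddit name or None out), a plausible sketch of their behavior, purely as an assumption:

import re

# Assumed behavior for utils.get_internal_link: pull the subreddit name out
# of a reddit.com URL, returning None when there is none.
_SUBREDDIT_URL_RE = re.compile(r'reddit\.com/r/([A-Za-z0-9_]+)', re.IGNORECASE)

# Assumed behavior for utils.extract_subreddit_xpost: find an
# "x-post from /r/<name>"-style marker in a submission title.
_XPOST_RE = re.compile(r'x-?post(?:ed)?[^/]*?/r/([A-Za-z0-9_]+)', re.IGNORECASE)

def get_internal_link(url):
	match = _SUBREDDIT_URL_RE.search(url or '')
	return match.group(1) if match else None

def extract_subreddit_xpost(title):
	match = _XPOST_RE.search(title or '')
	return match.group(1) if match else None
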
Code Example #3
def dump_wayback_submission_info(sub_path):
	start = time.time()

	# Log useful info
	num_submissions = 18609213
	counter = 0
	num_dumps = 0
	num_urls_found = 0
	num_xposts_found = 0
	num_domains_found = 0
	num_no_date = 0

	# Will store chunks of data to write, cleared at certain intervals
	# to avoid too much I/O.
	data_buffer = {}

	# Will store chunks of data specifically for the comment file, used
	# later to scrape comments.
	comment_file = open(os.path.join('..', 'data', 'reddit', 'wayback_comments_to_scrape.txt'), 'w')
	comment_buffer = []

	# Collect submissions
	with open(sub_path) as submission_file:
		for submission_raw in submission_file:

			submission = json.loads(submission_raw)

			# Get year, month, day
			try:
				tmstp = submission.get("created").values()[0]
			except AttributeError:
				print "WARNING::::no timestamp?", submission.get("created")
				continue
			if type(tmstp) == int:
				date = datetime.fromtimestamp(tmstp / 1000)
			elif type(tmstp) == datetime:
				date = tmstp
			else:
				num_no_date += 1
				continue

			if submission.get("comment_url"):
				subreddit = utils.get_internal_link(submission.get("comment_url"))
				if not subreddit:
					continue
			# Ignore ones that don't have associated subreddit
			else:
				continue

			year = str(date.year)
			month = str(date.month)
			day = str(date.day)

			path = os.path.join(subreddit[0].lower(), subreddit, year, month, day)
			make_dir(path)

			# Extract relevant pieces of information
			submission_id = str(submission.get("_id").values()[0])
			submission_title = submission.get("submission_title") or ''
			submission_prawid = submission.get("reddit_id") or '-1'

			# Create information to write to file
			submission_info = submission_id + "\t" + submission_prawid + "\t" + submission_title
			url_info = submission.get("url") or ''
			domain_info = submission.get("domain")

			# Get potential xpost info
			xpost_info = utils.extract_subreddit_xpost(submission_title)
			if not xpost_info:
				# The domain field may be missing, so guard before checking for
				# a self.<subreddit> domain
				if domain_info and domain_info.startswith("self."):
					internal = domain_info.split(".")[1]
					if internal != subreddit:
						xpost_info = internal

			# Create data object and store in buffer
			data = {'submissions': (submission_info + '\n').encode("utf8")}
			if url_info:
				data['urls'] = (submission_id + '\t' + url_info + '\n').encode("utf8")
				num_urls_found += 1
			if xpost_info:
				data['xposts'] = (submission_id + '\t' + xpost_info + '\n').encode("utf8")
				num_xposts_found += 1
			if domain_info:
				data['domains'] = (submission_id + '\t' + domain_info + '\n').encode("utf8")
				num_domains_found += 1

			try:
				data_buffer[path].append(data)
			except KeyError:
				data_buffer[path] = [data]

			# Store information about comments for later scraping
			if submission_prawid != '-1':
				comment_buffer.append((path, submission_prawid))

			# Write data and clear the buffer once <chunk_size> distinct output
			# paths have accumulated
			if len(data_buffer) >= chunk_size:

				# Only ever open the four files for one path at a time
				# (plus the always-open comment file)
				for path, data in data_buffer.iteritems():
					# Open files associated with this path
					filehandlers = {'submissions': open_file(path, 'submissions.txt'),
									'urls': open_file(path, 'urls.txt'),
									'xposts': open_file(path, 'xposts.txt'),
									'domains': open_file(path, 'domains.txt')}

					# Write data to files (every item has a 'submissions' entry;
					# the other types are optional)
					for data_item in data:
						filehandlers['submissions'].write(data_item['submissions'])
						if data_item.get('urls'):
							filehandlers['urls'].write(data_item['urls'])
						if data_item.get('xposts'):
							filehandlers['xposts'].write(data_item['xposts'])
						if data_item.get('domains'):
							filehandlers['domains'].write(data_item['domains'])

					# Now close files
					filehandlers['submissions'].close()
					filehandlers['urls'].close()
					filehandlers['xposts'].close()
					filehandlers['domains'].close()

				# Update comment file
				for path, pid in comment_buffer:
					comment_file.write(path + '\t' + pid + '\n')

				# Clear buffer
				data_buffer = {}
				comment_buffer = []
				num_dumps += 1

			counter += 1
			if counter % log_interval == 0:
				print "Wayback Progress:", counter, "submissions dumped out of", num_submissions, (counter / float(num_submissions)) * 100, "%"
				print "\tNumber of URLs found so far:", num_urls_found
				print "\tNumber of xposts found so far:", num_xposts_found
				print "\tNumber of dumps to file so far:", num_dumps
				print "\tNumber with no date:", num_no_date
				time_spent = (time.time() - start) / 60.0
				print "\tTime spent:", time_spent, "minutes"
				print "\tExpected time remaining:", (num_submissions - counter) / (counter / time_spent)
				print ""

	comment_file.close()
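
For reference, a hypothetical driver for these dump functions. The input paths and the chunk_size / log_interval values are illustrative assumptions, not taken from the original script, which defines them at module level:

# Hypothetical configuration; the original's actual values are not shown.
chunk_size = 10000      # flush the buffer after this many buffered entries
log_interval = 100000   # print progress every this many submissions

if __name__ == '__main__':
	# Placeholder input paths; subreddit_ids (a mapping from subreddit id to
	# subreddit name) must also be loaded beforehand for dump_submission_info.
	dump_submission_info(os.path.join('..', 'data', 'reddit', 'submissions.json'))
	dump_wayback_submission_info(os.path.join('..', 'data', 'reddit', 'wayback_submissions.json'))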