def compute_forum_stats(posts_file):
	'''
	Computes average thread and post statistics for a posts file.

	Threads are read with CancerReader.parse(posts_file, clean = True,
	stem = True), so post lengths are measured on the cleaned, stemmed
	content. (Per the original author's note this stems the text and drops
	terms of length <= 2; that filtering happens inside CancerReader and is
	not visible from this function.)

	Returns a tuple of three floats:
		- average # of posts in a thread
		- average length of a post, in whitespace-separated terms
		- average # of distinct users (by author_id) participating in a thread

	An average taken over an empty population (no threads at all, or
	threads that contain no posts) is reported as 0.0 instead of raising
	ZeroDivisionError.
	'''
	thread_total = 0
	post_total = 0
	term_total = 0
	participant_total = 0

	# Only sums and counts are needed for the averages, so keep running
	# totals instead of accumulating one list entry per thread / post.
	for thread in CancerReader.parse(posts_file, clean = True, stem = True):
		thread_total += 1
		post_total += len(thread.posts)

		thread_users = set()

		for post in thread.posts:
			term_total += len(post.content.split())
			thread_users.add(post.author_id)

		participant_total += len(thread_users)

	def average(total, population):
		# Guard the empty case so an empty posts file yields 0.0, not a crash.
		if not population:
			return 0.0
		return float(total) / float(population)

	average_thread_posts_count = average(post_total, thread_total)
	average_post_length = average(term_total, post_total)
	average_thread_user_count = average(participant_total, thread_total)

	return average_thread_posts_count, average_post_length, average_thread_user_count
def compute_forum_user_statistics(posts_file):
	'''
	Computes the following user stats:
		- number of distinct users (by author_id) seen in the posts file
		- number of those users who posted more than once within at least
		  one single thread

	The two numbers are returned as the tuple
	(num_users, num_users_thread_active).
	'''
	all_authors = set()
	repeat_authors = set()

	for thread in CancerReader.parse(posts_file, clean = False, stem = False):
		seen_in_thread = set()

		for post in thread.posts:
			author = post.author_id
			all_authors.add(author)

			if author in seen_in_thread:
				# second (or later) post by this author in the same thread
				repeat_authors.add(author)
			else:
				seen_in_thread.add(author)

	return len(all_authors), len(repeat_authors)
def generate_user_list_freqs(posts_file):
	'''
	Counts how many posts each author wrote across all threads in the
	posts file (parsed without cleaning or stemming).

	Returns a dict mapping author_id -> number of posts by that author.
	'''
	post_counts = {}

	for thread in CancerReader.parse(posts_file, clean = False, stem = False):
		for author in (post.author_id for post in thread.posts):
			post_counts[author] = post_counts.get(author, 0) + 1

	return post_counts
def generate_types_freqs(posts_file):
	'''
	Counts how often each term occurs across all posts in the posts file.

	Posts are parsed with clean = True and stem = True, and each post's
	content is split on whitespace to obtain its terms.

	Returns a dict mapping term -> total number of occurrences.
	'''
	term_counts = {}

	for thread in CancerReader.parse(posts_file, clean = True, stem = True):
		for post in thread.posts:
			tokens = post.content.split()
			for token in tokens:
				term_counts[token] = term_counts.get(token, 0) + 1

	return term_counts
def load_terms_file(terms_file):
	'''
	Reads a terms file with one term per line and maps each term to the
	zero-based number of the line it appears on.

	Leading and trailing whitespace is stripped from every line. If the
	same stripped term occurs on several lines, the last line number wins.

	Returns a dict mapping term -> line index.
	'''
	with open(terms_file, 'r') as handle:
		return {line.strip(): index for index, line in enumerate(handle)}

if __name__ == '__main__':

	terms_id_map = load_terms_file(sys.argv[1])
	
	for thread in CancerReader.parse(sys.argv[2], stem = True):
		for post in thread.posts:
			stemmed_terms = post.content.split()

			stemmed_id_terms = [terms_id_map[t] for t in stemmed_terms if t in terms_id_map]
			stemmed_id_terms_str = ' '.join(map(str, stemmed_id_terms))

			print '%(post_id)d|%(post_time_str)s|%(forum_id)d|%(thread_id)d|%(author_id)d|%(author_name)s|%(first_post_marker)d|%(title)s|%(content)s' %\
					{
						'post_id': post.post_id,
						'post_time_str': post.post_time_str,
						'forum_id': post.forum_id,
						'thread_id': post.thread_id,
						'author_id': post.author_id,
						'author_name': post.author_name,
						'first_post_marker': post.is_first_post,