def compute_forum_stats(posts_file):
    '''
    Computes the following stats for now:
        - average # of posts in a thread in the posts file
        - average length of posts, measured in whitespace-separated terms of
          the content returned by CancerReader.parse(clean = True, stem = True)
        - average # of distinct users (author ids) participating in a thread

    NOTE(review): the earlier version of this docstring said the post length
    is taken "after we stem and remove terms of length <= 2"; that filtering
    would happen inside CancerReader.parse and is not visible here -- confirm.

    Returns the tuple
        (average_thread_posts_count, average_post_length,
         average_thread_user_count)
    as floats. An average with nothing to average over (no threads, or no
    posts) is reported as 0.0 rather than raising ZeroDivisionError.
    '''
    def average(total, count):
        # An empty posts file would otherwise divide by zero.
        return float(total) / float(count) if count else 0.0

    # Running totals instead of per-post lists: only the sums and the counts
    # are needed, so memory stays constant regardless of forum size.
    threads_seen = 0
    posts_total = 0
    posts_measured = 0
    terms_total = 0
    thread_users_total = 0

    for thread in CancerReader.parse(posts_file, clean = True, stem = True):
        threads_seen += 1
        posts_total += len(thread.posts)

        thread_users = set()
        for post in thread.posts:
            posts_measured += 1
            terms_total += len(post.content.split())
            thread_users.add(post.author_id)
        thread_users_total += len(thread_users)

    average_thread_posts_count = average(posts_total, threads_seen)
    average_post_length = average(terms_total, posts_measured)
    average_thread_user_count = average(thread_users_total, threads_seen)

    return average_thread_posts_count, average_post_length, average_thread_user_count
def compute_forum_user_statistics(posts_file):
    '''
    Computes the following user stats:
        - the number of distinct users (author ids) who posted at all
        - the number of distinct users who posted more than once in at least
          one thread

    Returns the tuple (num_users, num_users_thread_active).
    '''
    users_set = set()
    # Only the number of such users is reported, so a set is enough; there is
    # no need to track in how many threads each user was active.
    thread_active_users = set()

    for thread in CancerReader.parse(posts_file, clean = False, stem = False):
        posts_per_user = {}
        for post in thread.posts:
            author = post.author_id
            users_set.add(author)
            posts_per_user[author] = posts_per_user.get(author, 0) + 1

        # .items() works on both Python 2 and 3; .iteritems() exists only on
        # Python 2.
        for user, count in posts_per_user.items():
            if count > 1:  # only record a user if there is > 1 post in this thread
                thread_active_users.add(user)

    num_users = len(users_set)
    num_users_thread_active = len(thread_active_users)
    return num_users, num_users_thread_active
def generate_user_list_freqs(posts_file):
    '''
    Counts how many posts each author wrote across all threads.

    Threads are read with CancerReader.parse(clean = False, stem = False).
    Returns a dict mapping post.author_id -> number of posts by that author.
    '''
    post_counts = {}
    threads = CancerReader.parse(posts_file, clean = False, stem = False)
    for thread in threads:
        for post in thread.posts:
            author = post.author_id
            post_counts[author] = post_counts.get(author, 0) + 1
    return post_counts
def generate_types_freqs(posts_file):
    '''
    Counts how often each term occurs across the content of all posts.

    Threads are read with CancerReader.parse(clean = True, stem = True) and
    each post's content is split on whitespace into terms.
    Returns a dict mapping term -> number of occurrences.
    '''
    term_counts = {}
    threads = CancerReader.parse(posts_file, clean = True, stem = True)
    for thread in threads:
        for post in thread.posts:
            terms = post.content.split()
            for term in terms:
                term_counts[term] = term_counts.get(term, 0) + 1
    return term_counts
def load_terms_file(terms_file): word_id_map = {} with open(terms_file, 'r') as terms_file_handle: for i, term in enumerate(terms_file_handle): word_id_map[term.strip()] = i return word_id_map if __name__ == '__main__': terms_id_map = load_terms_file(sys.argv[1]) for thread in CancerReader.parse(sys.argv[2], stem = True): for post in thread.posts: stemmed_terms = post.content.split() stemmed_id_terms = [terms_id_map[t] for t in stemmed_terms if t in terms_id_map] stemmed_id_terms_str = ' '.join(map(str, stemmed_id_terms)) print '%(post_id)d|%(post_time_str)s|%(forum_id)d|%(thread_id)d|%(author_id)d|%(author_name)s|%(first_post_marker)d|%(title)s|%(content)s' %\ { 'post_id': post.post_id, 'post_time_str': post.post_time_str, 'forum_id': post.forum_id, 'thread_id': post.thread_id, 'author_id': post.author_id, 'author_name': post.author_name, 'first_post_marker': post.is_first_post,