def get_subreddit_vocabularies():
    """Build the TF-IDF corpus from the comments of every subreddit in SUBREDDITS.

    For each subreddit, ``RedditWordCounter.subreddit_comments`` is called with
    ``limit=COMMENTS_PER_SUBREDDIT``; its result is added to a ``TfidfCorpus``
    as a document labelled with the subreddit name, and the corpus is saved
    straight after each addition.

    Returns:
        A ``(corpus, corpus_path)`` tuple: the ``TfidfCorpus`` instance and the
        path (SAVE_DIR joined with CORPUS_FILE) it was constructed with.
    """
    word_counter = RedditWordCounter(USERNAME)

    corpus_path = os.path.join(SAVE_DIR, CORPUS_FILE)
    corpus = TfidfCorpus(corpus_path)

    # Work through the subreddits as a FIFO queue so that a failed request can
    # be pushed to the back and tried again after the others.
    # NOTE: a subreddit whose request keeps raising HTTPError is re-queued every
    # time, so the loop only finishes once every subreddit has succeeded.
    pending = deque(SUBREDDITS)
    while pending:
        name = pending.popleft()
        try:
            vocabulary = word_counter.subreddit_comments(
                name, limit=COMMENTS_PER_SUBREDDIT)
        except requests.exceptions.HTTPError as err:
            print(err)
            pending.append(name)
        else:
            corpus.add_document(vocabulary, name)
            corpus.save()

    return corpus, corpus_path
def get_subreddit_vocabularies():
    """Fetch comment vocabularies for all SUBREDDITS and store them in a corpus.

    A ``RedditWordCounter`` created for USERNAME supplies the vocabulary of each
    subreddit (requested with ``limit=COMMENTS_PER_SUBREDDIT``). Every
    vocabulary is added to a ``TfidfCorpus`` under the subreddit's name and the
    corpus is saved immediately afterwards.

    Returns:
        tuple: ``(comment_corpus, corpus_path)`` - the ``TfidfCorpus`` object
        and the path built from SAVE_DIR and CORPUS_FILE that it was given.
    """
    counter = RedditWordCounter(USERNAME)
    save_path = os.path.join(SAVE_DIR, CORPUS_FILE)
    tfidf_corpus = TfidfCorpus(save_path)

    remaining = deque(SUBREDDITS)
    while remaining:
        current = remaining.popleft()
        try:
            words = counter.subreddit_comments(
                current, limit=COMMENTS_PER_SUBREDDIT)
        except requests.exceptions.HTTPError as http_error:
            # Report the failure and put the subreddit at the back of the
            # queue; it is retried (without any retry limit) once the
            # subreddits ahead of it have been processed.
            print(http_error)
            remaining.append(current)
            continue

        tfidf_corpus.add_document(words, current)
        tfidf_corpus.save()

    return tfidf_corpus, save_path
# Extract their word counts
corpus, corpus_path = get_subreddit_vocabularies()
print('TF-IDF corpus saved to %s' % corpus_path)

# Get the top words by subreddit
top_terms_path = save_subreddit_top_terms(corpus)
# Report the value returned by save_subreddit_top_terms rather than the corpus
# path; the one-element tuple keeps the formatting safe whatever type it is.
print('Top terms saved to %s' % (top_terms_path,))

# Get the swearword frequency
swearword_frequency = get_swearword_counts(corpus)
print('Normalized swearword frequency:')
for subreddit, frequency in swearword_frequency.items():
    print('%s, %s' % (subreddit, frequency))

# Get the average word length
print('\nAverage word length by subreddit:')
word_lengths = get_vocabulary_sophistication(corpus)
for subreddit, frequency in word_lengths.items():
    print('%s, %s' % (subreddit, frequency))

#######################
# MACHINE LEARNING DEMO
#######################

# Collect the comments for a particular user and determine which subreddit
# their comments best match up with
counter = RedditWordCounter(USERNAME)
# A fresh TfidfCorpus is constructed from the same path the corpus above used.
corpus = TfidfCorpus(os.path.join(SAVE_DIR, CORPUS_FILE))
user_comments = counter.user_comments('way_fairer')
corpus.train_classifier(classifier_type='LinearSVC', tfidf=True)
print(corpus.classify_document(user_comments))