import os
from collections import deque

import requests

# RedditWordCounter and TfidfCorpus are assumed to live in the project's own
# modules; adjust these import paths to match your layout.
from reddit_counter import RedditWordCounter
from tfidf_corpus import TfidfCorpus


def get_subreddit_vocabularies():
    # Initialise Reddit word counter instance
    reddit_counter = RedditWordCounter(USERNAME)

    # Initialise tf-idf corpus instance
    corpus_path = os.path.join(SAVE_DIR, CORPUS_FILE)
    comment_corpus = TfidfCorpus(corpus_path)

    # Extract the vocabulary for each of the subreddits specified
    subreddit_queue = deque(SUBREDDITS)
    while subreddit_queue:
        subreddit = subreddit_queue.popleft()

        try:
            vocabulary = reddit_counter.subreddit_comments(
                subreddit, limit=COMMENTS_PER_SUBREDDIT)
        except requests.exceptions.HTTPError as err:
            print(err)
            # Add the subreddit back into the queue so it is retried later
            subreddit_queue.append(subreddit)
            continue

        comment_corpus.add_document(vocabulary, subreddit)
        comment_corpus.save()

    return comment_corpus, corpus_path
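# Minimal usage sketch. The module-level constants referenced by
# get_subreddit_vocabularies() are assumed to be configured elsewhere in the
# project; the values below are illustrative placeholders only.
USERNAME = 'your_reddit_username'            # hypothetical Reddit account
SAVE_DIR = 'corpora'                         # hypothetical output directory
CORPUS_FILE = 'reddit_corpus.json'           # hypothetical corpus file name
SUBREDDITS = ['python', 'learnprogramming']  # subreddits to sample
COMMENTS_PER_SUBREDDIT = 1000                # comments to fetch per subreddit

if __name__ == '__main__':
    corpus, path = get_subreddit_vocabularies()
    print('Corpus saved to {}'.format(path))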