Code Example #1
File: example.py Project: SearchPilot/reddit-nlp
# Assumed imports: os, deque and requests are standard; RedditWordCounter and
# TfidfCorpus are assumed to come from the project's redditnlp package.
import os
from collections import deque

import requests

from redditnlp import RedditWordCounter, TfidfCorpus


def get_subreddit_vocabularies():
    # Initialise Reddit word counter instance
    reddit_counter = RedditWordCounter(USERNAME)

    # Initialise tf-idf corpus instance
    corpus_path = os.path.join(SAVE_DIR, CORPUS_FILE)
    comment_corpus = TfidfCorpus(corpus_path)

    # Extract the vocabulary for each of the subreddits specified
    subreddit_queue = deque(SUBREDDITS)
    while subreddit_queue:
        subreddit = subreddit_queue.popleft()

        try:
            vocabulary = reddit_counter.subreddit_comments(
                subreddit, limit=COMMENTS_PER_SUBREDDIT)
        except requests.exceptions.HTTPError as err:
            print(err)
            # Add the subreddit back into the queue to retry later
            subreddit_queue.append(subreddit)
            continue

        # Record this subreddit's word counts and persist the corpus to disk
        comment_corpus.add_document(vocabulary, subreddit)
        comment_corpus.save()

    return comment_corpus, corpus_path
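
The function relies on module-level configuration constants (USERNAME, SAVE_DIR, CORPUS_FILE, SUBREDDITS, COMMENTS_PER_SUBREDDIT) defined elsewhere in example.py. Below is a minimal sketch of how that configuration and an invocation might look; every value is an illustrative placeholder, not taken from the project:

# Hypothetical configuration; the real example.py defines its own values.
USERNAME = 'your_reddit_username'            # placeholder account name
SAVE_DIR = 'corpora'                         # placeholder output directory
CORPUS_FILE = 'reddit_corpus.json'           # placeholder corpus file name
SUBREDDITS = ['python', 'learnprogramming']  # placeholder subreddit list
COMMENTS_PER_SUBREDDIT = 1000                # placeholder comment limit

if __name__ == '__main__':
    corpus, path = get_subreddit_vocabularies()
    print('Corpus saved to {}'.format(path))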
Code Example #2
File: example.py Project: John-Keating/reddit-nlp
# Same assumed imports as Code Example #1 (os, deque, requests, redditnlp).
def get_subreddit_vocabularies():
    # Initialise Reddit word counter instance
    reddit_counter = RedditWordCounter(USERNAME)

    # Initialise tf-idf corpus instance
    corpus_path = os.path.join(SAVE_DIR, CORPUS_FILE)
    comment_corpus = TfidfCorpus(corpus_path)

    # Extract the vocabulary for each of the subreddits specified
    subreddit_queue = deque(SUBREDDITS)
    while subreddit_queue:
        subreddit = subreddit_queue.popleft()

        try:
            vocabulary = reddit_counter.subreddit_comments(
                subreddit, limit=COMMENTS_PER_SUBREDDIT)
        except requests.exceptions.HTTPError as err:
            print(err)
            # Add the subreddit back into the queue to retry later
            subreddit_queue.append(subreddit)
            continue

        # Record this subreddit's word counts and persist the corpus to disk
        comment_corpus.add_document(vocabulary, subreddit)
        comment_corpus.save()

    return comment_corpus, corpus_path
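
Both examples share the same retry policy: a subreddit whose fetch raises an HTTPError is appended back onto the queue unconditionally, so a persistently failing subreddit would be retried forever. A hedged variant of the loop with a bounded retry count is sketched below; it reuses reddit_counter and comment_corpus from the function above, and MAX_RETRIES is an illustrative name that appears in neither project:

# Hypothetical variant: cap retries per subreddit instead of retrying forever.
MAX_RETRIES = 3  # illustrative limit

subreddit_queue = deque((subreddit, 0) for subreddit in SUBREDDITS)
while subreddit_queue:
    subreddit, attempts = subreddit_queue.popleft()
    try:
        vocabulary = reddit_counter.subreddit_comments(
            subreddit, limit=COMMENTS_PER_SUBREDDIT)
    except requests.exceptions.HTTPError as err:
        print(err)
        if attempts + 1 < MAX_RETRIES:
            # Re-queue with an incremented attempt count
            subreddit_queue.append((subreddit, attempts + 1))
        continue
    comment_corpus.add_document(vocabulary, subreddit)
    comment_corpus.save()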