Esempio n. 1
0
def main():
    """Build tf-idf-weighted word clouds: one per chapter of
    "Pride and Prejudice", plus one for the whole book.

    Clouds are written as PNGs under ``data/clouds/ex6/``.
    """
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))

    lines_per_chapter = common.split_lines_into_chapters(lines)

    words_per_chapter = [
        tuple(common.extract_words_from_lines(chapter_lines))
        for chapter_lines in lines_per_chapter
    ]

    # Create the output directory once, up front. parents=True makes this
    # robust when 'data/clouds' does not exist yet (plain mkdir(exist_ok=True)
    # raises FileNotFoundError in that case), and hoisting it out of the loop
    # avoids redoing it for every chapter.
    out_dir = pathlib.Path("data/clouds/ex6")
    out_dir.mkdir(parents=True, exist_ok=True)

    # For each document separately build a word cloud
    # using obtained tf-idf weights.
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        _render_tf_idf_cloud(words, words_per_chapter,
                             out_dir / f"chapter-{chapter_idx}.png")

    # Build a word cloud based on tf-idf weights for the entire book
    # (the book is treated as the single document of the corpus).
    all_words = tuple(common.extract_words_from_lines(lines))
    _render_tf_idf_cloud(all_words, [all_words], out_dir / "whole-book.png")


def _render_tf_idf_cloud(words, all_documents, output_path):
    """Render a word cloud for *words*, weighted by tf-idf against
    *all_documents*, and save it to *output_path*.
    """
    weights = {
        word: common.tf_idf_weights(word, words, all_documents)
        for word in set(words)
    }
    cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    cloud.generate_from_frequencies(weights)
    cloud.to_file(str(output_path))
Esempio n. 2
0
def create_clouds_for_chapters(chapter_lines):
    """Build one count-weighted word cloud per chapter.

    ``chapter_lines`` is an iterable of per-chapter line sequences; each
    chapter's cloud is saved under ``clouds/counts/``.

    Bug fixed: the loop iterated over ``chapters_lines`` — a name that does
    not exist (the parameter is ``chapter_lines``) — so the function raised
    NameError on every call; the loop variable also shadowed the parameter.
    """
    for chapter_idx, one_chapter in enumerate(chapter_lines, 1):
        words = common.extract_words_from_lines(one_chapter)
        words_count = collections.Counter(words)
        cloud = wordcloud.WordCloud(background_color="white", max_words=20000)
        cloud.generate_from_frequencies(words_count)
        cloud.to_file(f"clouds/counts/chapter_{chapter_idx}_cloud.png")
Esempio n. 3
0
def main():
    """Build a single count-weighted word cloud for the whole book
    and save it to ``data/clouds/ex5/cloud.png``.
    """
    book_lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))

    # Word frequencies drive the relative size of each word in the cloud.
    frequencies = collections.Counter(
        common.extract_words_from_lines(book_lines))

    word_cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    word_cloud.generate_from_frequencies(frequencies)

    pathlib.Path('data/clouds/ex5/').mkdir(exist_ok=True)
    word_cloud.to_file("data/clouds/ex5/cloud.png")
Esempio n. 4
0
def create_weighted_clouds_for_chapters(chapter_lines):
    """Build one tf-idf-weighted word cloud per chapter.

    ``chapter_lines`` is an iterable of per-chapter line sequences. Each
    chapter's words are weighted by tf-idf against all chapters, and the
    resulting cloud is saved under ``clouds/tf_idf/``.

    Bug fixed: the body iterated over ``chapters_lines`` — a name that does
    not exist (the parameter is ``chapter_lines``) — so the function raised
    NameError on every call.
    """
    chapters_words = [
        tuple(common.extract_words_from_lines(one_chapter))
        for one_chapter in chapter_lines
    ]

    for chapter_idx, words_in_chapter in enumerate(chapters_words, 1):
        # NOTE(review): tf_idf_weights is called unqualified here while
        # sibling code uses common.tf_idf_weights — assumes a direct import
        # exists in this module; confirm against the file's import block.
        words_in_chapter_weights = {
            word: tf_idf_weights(word, words_in_chapter, chapters_words)
            for word in set(words_in_chapter)
        }

        cloud = wordcloud.WordCloud(background_color="white", max_words=20000)
        cloud.generate_from_frequencies(words_in_chapter_weights)
        cloud.to_file(f"clouds/tf_idf/chapter_{chapter_idx}_cloud.png")
def main():
    """Print, for each chapter, the tf-idf weight and occurrence count of
    the word given on the command line — ordered by ascending weight.

    Bug fixed: the original sorted the already-formatted strings with
    ``operator.itemgetter(1)``, which indexes character 1 of each string
    (the 'h' of 'chapter=' for every entry), so the list was never actually
    ordered by weight. We now sort on the numeric weight before formatting.
    """
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    lines_per_chapter = common.split_lines_into_chapters(lines)
    words_per_chapter = [
        tuple(common.extract_words_from_lines(chapter))
        for chapter in lines_per_chapter
    ]

    checked_word = get_first_arg()

    # Collect (weight, chapter index, occurrence count) per chapter so the
    # sort can use the numeric weight directly.
    stats = []
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        weight = common.tf_idf_weights(checked_word, words, words_per_chapter)
        stats.append((weight, chapter_idx, words.count(checked_word)))

    stats.sort()
    pprint.pprint([
        'chapter=%s,weight=%s,count=%s' % (chapter_idx, weight, count)
        for weight, chapter_idx, count in stats
    ])
Esempio n. 6
0
def main():
    """Generate random sentences from a first-order Markov-style model of
    the book: each word maps to its five most frequent successors.

    The number of sentences comes from argv (defaults to 1); sentences are
    printed on one line, separated by spaces.
    """
    no_sentences = read_num_of_sentences_from_argv() or 1

    lines = common.read_lines_from_file("data/pride-and-prejudice.txt")
    words = tuple(common.extract_words_from_lines(lines))

    # Map each word to the list of words observed immediately after it.
    followers = collections.defaultdict(list)
    for current, following in zip(words, words[1:]):
        followers[current].append(following)

    # Keep only the five most common successors of each word.
    word_to_most_common = {
        word: tuple(successor for successor, _count
                    in collections.Counter(nexts).most_common(5))
        for word, nexts in followers.items()
    }

    for _ in range(no_sentences):
        print(get_random_sentence(word_to_most_common), end=' ')
    print()
Esempio n. 7
0
def create_clouds_for_whole_document(lines):
    """Render one count-weighted word cloud for the entire document and
    save it to ``clouds/all_words_cloud.png``.
    """
    frequencies = collections.Counter(common.extract_words_from_lines(lines))

    document_cloud = wordcloud.WordCloud(
        background_color="white", max_words=20000)
    document_cloud.generate_from_frequencies(frequencies)
    document_cloud.to_file("clouds/all_words_cloud.png")