def main():
    """Build tf-idf weighted word clouds: one per chapter, plus one for the whole book.

    Reads the book, splits it into chapters, weights every distinct word with
    common.tf_idf_weights against the per-chapter corpus, and writes one PNG
    per chapter plus a whole-book cloud under data/clouds/ex6/.
    """
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    lines_per_chapter = common.split_lines_into_chapters(lines)
    words_per_chapter = [
        tuple(common.extract_words_from_lines(chapter_lines))
        for chapter_lines in lines_per_chapter
    ]

    # Create the output directory once, up front. parents=True is required:
    # plain mkdir(exist_ok=True) raises FileNotFoundError if 'data/clouds'
    # itself does not exist yet.
    out_dir = pathlib.Path('data/clouds/ex6/')
    out_dir.mkdir(parents=True, exist_ok=True)

    # For each chapter separately build a word cloud using tf-idf weights
    # computed against the full per-chapter corpus.
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        words_weights = {
            word: common.tf_idf_weights(word, words, words_per_chapter)
            for word in set(words)
        }
        _save_cloud(words_weights, out_dir / f"chapter-{chapter_idx}.png")

    # Build a word cloud for the entire book, treating it as a
    # single-document corpus.
    all_words = tuple(common.extract_words_from_lines(lines))
    words_weights = {
        word: common.tf_idf_weights(word, all_words, [all_words])
        for word in set(all_words)
    }
    _save_cloud(words_weights, out_dir / "whole-book.png")


def _save_cloud(words_weights, path):
    """Render a frequency-weighted word cloud and write it to *path* (PNG)."""
    cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    cloud.generate_from_frequencies(words_weights)
    cloud.to_file(str(path))
def create_clouds_for_chapters(chapter_lines):
    """Create one raw word-count word cloud PNG per chapter.

    *chapter_lines* is an iterable of per-chapter line sequences.

    Bug fix: the loop previously iterated the undefined name
    ``chapters_lines`` instead of the ``chapter_lines`` parameter,
    raising NameError on every call.
    """
    for chapter_idx, lines_in_chapter in enumerate(chapter_lines, 1):
        words = common.extract_words_from_lines(lines_in_chapter)
        words_count = collections.Counter(words)
        cloud = wordcloud.WordCloud(background_color="white", max_words=20000)
        cloud.generate_from_frequencies(words_count)
        cloud.to_file(f"clouds/counts/chapter_{chapter_idx}_cloud.png")
def main():
    """Build a single word cloud from raw word counts over the whole book."""
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    words = common.extract_words_from_lines(lines)
    words_count = collections.Counter(words)
    cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    cloud.generate_from_frequencies(words_count)
    # parents=True: mkdir(exist_ok=True) alone raises FileNotFoundError
    # when the intermediate 'data/clouds' directory does not exist yet.
    pathlib.Path('data/clouds/ex5/').mkdir(parents=True, exist_ok=True)
    cloud.to_file("data/clouds/ex5/cloud.png")
def create_weighted_clouds_for_chapters(chapter_lines):
    """Create one tf-idf weighted word cloud PNG per chapter.

    *chapter_lines* is an iterable of per-chapter line sequences.

    Bug fix: the list comprehension previously iterated the undefined
    name ``chapters_lines`` instead of the ``chapter_lines`` parameter,
    raising NameError on every call.
    """
    chapters_words = [
        tuple(common.extract_words_from_lines(lines_in_chapter))
        for lines_in_chapter in chapter_lines
    ]
    for chapter_idx, words_in_chapter in enumerate(chapters_words, 1):
        # Weight every distinct word of this chapter against the full
        # per-chapter corpus.
        words_in_chapter_weights = {
            word: tf_idf_weights(word, words_in_chapter, chapters_words)
            for word in set(words_in_chapter)
        }
        cloud = wordcloud.WordCloud(background_color="white", max_words=20000)
        cloud.generate_from_frequencies(words_in_chapter_weights)
        cloud.to_file(f"clouds/tf_idf/chapter_{chapter_idx}_cloud.png")
def main():
    """Print tf-idf weight and count of one word per chapter, sorted by weight.

    The word to inspect is taken from the command line via get_first_arg().

    Bug fix: the original sorted the already-formatted strings with
    operator.itemgetter(1), which compares the second *character* of every
    string — always 'h' in 'chapter=...' — so the sort was a no-op.
    We now sort the numeric (chapter, weight, count) tuples by weight
    before formatting.
    """
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    lines_per_chapter = common.split_lines_into_chapters(lines)
    words_per_chapter = [
        tuple(common.extract_words_from_lines(chapter_lines))
        for chapter_lines in lines_per_chapter
    ]
    checked_word = get_first_arg()

    stats = [
        (chapter_idx,
         common.tf_idf_weights(checked_word, words, words_per_chapter),
         words.count(checked_word))
        for chapter_idx, words in enumerate(words_per_chapter, 1)
    ]
    # Sort numerically by the tf-idf weight (tuple field 1).
    stats.sort(key=operator.itemgetter(1))
    pprint.pprint(['chapter=%s,weight=%s,count=%s' % stat for stat in stats])
def main():
    """Generate random sentences from a first-order Markov model of the book.

    The number of sentences comes from the command line (default 1); each
    word maps to its five most frequent successors in the text.
    """
    sentence_count = read_num_of_sentences_from_argv() or 1
    lines = common.read_lines_from_file("data/pride-and-prejudice.txt")
    words = tuple(common.extract_words_from_lines(lines))

    # Map every word to the list of words that directly follow it.
    successors = collections.defaultdict(list)
    for current, following in zip(words, words[1:]):
        successors[current].append(following)

    # Keep only the five most frequent successors of each word.
    word_to_most_common = {
        word: tuple(follower for follower, _count in
                    collections.Counter(followers).most_common(5))
        for word, followers in successors.items()
    }

    # Emit every sentence followed by a space, then a final newline.
    output = ''.join(get_random_sentence(word_to_most_common) + ' '
                     for _ in range(sentence_count))
    print(output)
def create_clouds_for_whole_document(lines):
    """Render one raw word-count word cloud for the entire document."""
    frequency = collections.Counter(common.extract_words_from_lines(lines))
    cloud = wordcloud.WordCloud(background_color="white", max_words=20000)
    cloud.generate_from_frequencies(frequency)
    cloud.to_file("clouds/all_words_cloud.png")