def generate_training_sample(file_name, num_records, is_training_data=False):
    """Read lines from *file_name* and convert them to question pairs.

    Parameters:
        file_name: path of the input text file.
        num_records: number of lines to sample (with replacement, as
            before); 0 means use every line, unsampled.
        is_training_data: forwarded unchanged to
            ``common.convert_lines_to_question_pairs``.

    Returns:
        Whatever ``common.convert_lines_to_question_pairs`` returns for
        the (possibly sampled) lines.
    """
    # Read once; the old code re-read the file in the num_records == 0 branch.
    lines = np.array(common.read_lines_from_file(file_name))
    if num_records != 0:
        # BUG FIX: `lines` is a 1-D array of strings, so the original 2-D
        # index `lines[idx, :]` raised "IndexError: too many indices".
        # Index along the single axis instead; sampling remains with
        # replacement via np.random.randint.
        sampled_lines = lines[np.random.randint(len(lines), size=num_records)]
    else:
        sampled_lines = lines
    return common.convert_lines_to_question_pairs(
        sampled_lines.tolist(), is_training_data)
def main():
    """Build per-chapter and whole-book word clouds from tf-idf weights."""
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    lines_per_chapter = common.split_lines_into_chapters(lines)
    # Renamed the loop variable: the original comprehension shadowed `lines`.
    words_per_chapter = [
        tuple(common.extract_words_from_lines(chapter_lines))
        for chapter_lines in lines_per_chapter
    ]

    # Create the output directory once, up front — the original re-created
    # it on every loop iteration. parents=True makes the run robust when
    # data/clouds does not exist yet.
    pathlib.Path('data/clouds/ex6/').mkdir(parents=True, exist_ok=True)

    # For each chapter separately build a word cloud using its tf-idf weights.
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        words_weights = {
            word: common.tf_idf_weights(word, words, words_per_chapter)
            for word in set(words)
        }
        cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
        cloud.generate_from_frequencies(words_weights)
        cloud.to_file(f"data/clouds/ex6/chapter-{chapter_idx}.png")

    # Build a word cloud based on tf-idf weights for the entire book.
    # NOTE(review): with a single-document corpus the idf factor is
    # presumably constant, so these weights likely reduce to raw term
    # frequency — confirm against common.tf_idf_weights.
    all_words = tuple(common.extract_words_from_lines(lines))
    words_weights = {
        word: common.tf_idf_weights(word, all_words, [all_words])
        for word in set(all_words)
    }
    cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    cloud.generate_from_frequencies(words_weights)
    cloud.to_file(f"data/clouds/ex6/whole-book.png")
def main():
    """Render a word cloud of raw word frequencies for the whole book."""
    book_lines = tuple(
        common.read_lines_from_file("data/pride-and-prejudice.txt"))
    # Counter maps each word to its occurrence count across the book.
    frequencies = collections.Counter(
        common.extract_words_from_lines(book_lines))
    word_cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    word_cloud.generate_from_frequencies(frequencies)
    pathlib.Path('data/clouds/ex5/').mkdir(exist_ok=True)
    word_cloud.to_file("data/clouds/ex5/cloud.png")
def main():
    """Print the tf-idf weight of one word (from argv) for every chapter,
    sorted by ascending weight."""
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    lines_per_chapter = common.split_lines_into_chapters(lines)
    words_per_chapter = [
        tuple(common.extract_words_from_lines(chapter_lines))
        for chapter_lines in lines_per_chapter
    ]
    checked_word = get_first_arg()

    # BUG FIX: the original formatted each entry as a string and then
    # sorted with operator.itemgetter(1), which indexes character 1 of the
    # *string* — 'h' of "chapter=" for every entry — so nothing was
    # actually ordered.  Collect numeric tuples first and sort by the real
    # weight, then format for display.
    stats = []
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        weight = common.tf_idf_weights(checked_word, words, words_per_chapter)
        stats.append((chapter_idx, weight, words.count(checked_word)))
    stats.sort(key=operator.itemgetter(1))
    pprint.pprint(
        ['chapter=%s,weight=%s,count=%s' % entry for entry in stats])
def main():
    """Emit pseudo-random sentences driven by a most-common-successor chain."""
    sentence_count = read_num_of_sentences_from_argv() or 1
    lines = common.read_lines_from_file("data/pride-and-prejudice.txt")
    words = tuple(common.extract_words_from_lines(lines))

    # Map every word to the list of words that immediately follow it in
    # the text.  zip(words, words[1:]) pairs each word with its successor.
    successors = collections.defaultdict(list)
    for current, following in zip(words, words[1:]):
        successors[current].append(following)

    # Keep only the five most frequent successors of each word.
    top_successors = {
        word: tuple(candidate for candidate, _count in
                    collections.Counter(followers).most_common(5))
        for word, followers in successors.items()
    }

    for _ in range(sentence_count):
        print(get_random_sentence(top_successors), end=' ')
    print()
# NOTE(review): this fragment mixes imports with executable statements;
# PEP 8 wants all imports grouped at the top of the file.
import common

filename = common.get_filename()
lines1 = common.read_lines_from_file(filename)

import requests
import re
import sys
import unicodedata
from bs4 import BeautifulSoup
import os
from yandex_translate import YandexTranslate

# SECURITY NOTE(review): hard-coded API credential committed to source —
# revoke this key and load it from an environment variable or secret store.
translate = YandexTranslate(
    'trnsl.1.1.20170222T052338Z.74276f7925a61714.f88a6340cffbdf48e1600c4cfb2f9ba3a22c514f'
)

urls = []      # collected page URLs — filled elsewhere, not visible in this chunk
values2 = []   # NOTE(review): purpose not determinable from this chunk

# Keyword variants (case / trailing punctuation) used to spot
# ad/virus-removal pages.
# BUG NOTE(review): a comma is missing between 'Pop-ups.' and 'remove,'
# below, so implicit string concatenation silently yields the single
# element 'Pop-ups.remove,' instead of two entries.  Left byte-identical
# here; fix by inserting the comma.
keys_ads = [
    'remove', 'ad', 'ads', 'virus', 'advertising', 'pop-up', 'pop-ups',
    'Remove', 'Ad', 'Ads', 'Virus', 'Advertising', 'Pop-up', 'Pop-ups',
    'remove.', 'ad.', 'ads.', 'virus.', 'advertising.', 'pop-up.',
    'pop-ups.', 'Remove.', 'Ad.', 'Ads.', 'Virus.', 'Advertising.',
    'Pop-up.', 'Pop-ups.' 'remove,', 'ad,', 'ads,', 'virus,',
    'advertising,', 'pop-up,', 'pop-ups,', 'Remove,', 'Ad,', 'Ads,',
    'Virus,', 'Advertising,', 'Pop-up,', 'Pop-ups,'
]

# Keyword variants for e-commerce pages.
# NOTE(review): this list is truncated at the end of the visible chunk —
# the closing bracket and remaining elements are outside this view.
keys_commerce = [
    'buy', 'shop', 'shopping', 'buyers', 'sellers', 'online-store',
    'online-marketplace', 'commerce', 'e-commerce', 'import', 'imports',
    'export', 'exports', 'Buy', 'Shop', 'Shopping', 'Buyers', 'Sellers',
def main(): lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))