import numpy as np

import common


def generate_training_sample(file_name, num_records, is_training_data=False):
    if num_records != 0:
        lines = np.array(common.read_lines_from_file(file_name))
        # Sample num_records lines uniformly at random, with replacement.
        # lines is a 1-D array, so a plain index array suffices.
        sampled_lines = lines[
            np.random.randint(len(lines), size=num_records)]
    else:
        sampled_lines = np.array(common.read_lines_from_file(file_name))
    return common.convert_lines_to_question_pairs(sampled_lines.tolist(),
                                                  is_training_data)
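
# np.random.randint draws indices with replacement, so the same line
# can be sampled more than once. A hypothetical call, assuming a file
# of newline-delimited question records:
#
#     pairs = generate_training_sample('data/questions.txt', 1000,
#                                      is_training_data=True)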
Example #2
import pathlib

import common
import wordcloud


def main():
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))

    lines_per_chapter = common.split_lines_into_chapters(lines)

    words_per_chapter = [
        tuple(common.extract_words_from_lines(lines))
        for lines in lines_per_chapter
    ]

    # Build a separate word cloud for each chapter from its tf-idf weights.
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        distinct_words = set(words)
        words_weights = {
            word: common.tf_idf_weights(word, words, words_per_chapter)
            for word in distinct_words
        }
        cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
        cloud.generate_from_frequencies(words_weights)
        pathlib.Path('data/clouds/ex6/').mkdir(parents=True, exist_ok=True)
        cloud.to_file(f"data/clouds/ex6/chapter-{chapter_idx}.png")

    # Build a word cloud based on tf-idf weights for the entire book.
    all_words = tuple(common.extract_words_from_lines(lines))
    distinct_words = set(all_words)
    words_weights = {
        word: common.tf_idf_weights(word, all_words, [all_words])
        for word in distinct_words
    }
    cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    cloud.generate_from_frequencies(words_weights)
    pathlib.Path('data/clouds/ex6/').mkdir(parents=True, exist_ok=True)
    cloud.to_file("data/clouds/ex6/whole-book.png")
Example #3
import collections
import pathlib

import common
import wordcloud


def main():
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    words = common.extract_words_from_lines(lines)
    words_count = collections.Counter(words)
    cloud = wordcloud.WordCloud(background_color="white", max_words=5000)
    cloud.generate_from_frequencies(words_count)
    pathlib.Path('data/clouds/ex5/').mkdir(parents=True, exist_ok=True)
    cloud.to_file("data/clouds/ex5/cloud.png")
Example #4
import operator
import pprint

import common


def main():
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))
    lines_per_chapter = common.split_lines_into_chapters(lines)
    words_per_chapter = [
        tuple(common.extract_words_from_lines(lines))
        for lines in lines_per_chapter
    ]

    checked_word = get_first_arg()

    weights_in_chapters = []
    for chapter_idx, words in enumerate(words_per_chapter, 1):
        weight_in_chapter = common.tf_idf_weights(checked_word, words,
                                                  words_per_chapter)
        weights_in_chapters.append(
            (chapter_idx, weight_in_chapter, words.count(checked_word)))

    # Sort by the tf-idf weight itself (the second tuple element);
    # sorting formatted strings would compare characters, not numbers.
    pprint.pprint([
        'chapter=%s,weight=%s,count=%s' % entry
        for entry in sorted(weights_in_chapters, key=operator.itemgetter(1))
    ])
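
# get_first_arg is not shown in this snippet; presumably it reads the
# checked word from the command line. A hypothetical sketch:
import sys


def get_first_arg():
    return sys.argv[1]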
Example #5
import collections

import common


def main():
    no_sentences = read_num_of_sentences_from_argv() or 1

    lines = common.read_lines_from_file("data/pride-and-prejudice.txt")
    words = tuple(common.extract_words_from_lines(lines))

    # Map each word to every word that follows it somewhere in the text.
    word_to_next_words = collections.defaultdict(list)

    for word, next_word in zip(words[:-1], words[1:]):
        word_to_next_words[word].append(next_word)

    # Keep only the five most common successors of each word.
    word_to_most_common = {}

    for word, next_words in word_to_next_words.items():
        most_common = collections.Counter(next_words).most_common(5)
        word_to_most_common[word] = tuple(w[0] for w in most_common)

    for _ in range(no_sentences):
        print(get_random_sentence(word_to_most_common), end=' ')
    print()
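
# Neither read_num_of_sentences_from_argv nor get_random_sentence is
# shown in this snippet. Hypothetical sketches, assuming the generator
# starts at a random word and walks the most-common-successor table:
import random
import sys


def read_num_of_sentences_from_argv():
    return int(sys.argv[1]) if len(sys.argv) > 1 else None


def get_random_sentence(word_to_most_common, max_words=20):
    word = random.choice(list(word_to_most_common))
    sentence = [word]
    for _ in range(max_words - 1):
        followers = word_to_most_common.get(word)
        if not followers:
            break
        word = random.choice(followers)
        sentence.append(word)
    return ' '.join(sentence) + '.'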
Example #6
import os
import re
import sys
import unicodedata

import requests
from bs4 import BeautifulSoup
from yandex_translate import YandexTranslate

import common

filename = common.get_filename()
lines1 = common.read_lines_from_file(filename)

translate = YandexTranslate(
    'trnsl.1.1.20170222T052338Z.74276f7925a61714.f88a6340cffbdf48e1600c4cfb2f9ba3a22c514f'
)
urls = []
values2 = []

keys_ads = [
    'remove', 'ad', 'ads', 'virus', 'advertising', 'pop-up', 'pop-ups',
    'Remove', 'Ad', 'Ads', 'Virus', 'Advertising', 'Pop-up', 'Pop-ups',
    'remove.', 'ad.', 'ads.', 'virus.', 'advertising.', 'pop-up.', 'pop-ups.',
    'Remove.', 'Ad.', 'Ads.', 'Virus.', 'Advertising.', 'Pop-up.', 'Pop-ups.',
    'remove,', 'ad,', 'ads,', 'virus,', 'advertising,', 'pop-up,', 'pop-ups,',
    'Remove,', 'Ad,', 'Ads,', 'Virus,', 'Advertising,', 'Pop-up,', 'Pop-ups,'
]
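
# Spelling out every case and punctuation variant by hand is easy to
# get wrong: adjacent string literals without a comma between them
# silently concatenate. A sketch that derives the same keys_ads
# contents programmatically, assuming only exact-string matching:
#
#     keys_ads = [
#         variant
#         for word in ('remove', 'ad', 'ads', 'virus', 'advertising',
#                      'pop-up', 'pop-ups')
#         for form in (word, word.capitalize())
#         for variant in (form, form + '.', form + ',')
#     ]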

keys_commerce = [
    'buy', 'shop', 'shopping', 'buyers', 'sellers', 'online-store',
    'online-marketplace', 'commerce', 'e-commerce', 'import', 'imports',
    'export', 'exports', 'Buy', 'Shop', 'Shopping', 'Buyers', 'Sellers',
    'Online-store', 'Online-marketplace', 'Commerce', 'E-commerce',
    'Import', 'Imports', 'Export', 'Exports'
]
Example #7
def main():
    lines = tuple(common.read_lines_from_file("data/pride-and-prejudice.txt"))