def test_filter_with_numbers(): words = filter_words('the player number 1 is now the winner') assert len(words) == 3 assert words == ['player', 'number', 'winner']
# process new words per chapter # if not know this word and is not in the top words yet if not db.zrank(keys.TOP_WORDS_IN_BOOK, w): db.hincrby(keys.NEW_WORDS_DISTRIBUTION_BY_CHAPTER, keys.CHAPTER_HASH_KEY.format(chapter)) db.zincrby(keys.TOP_WORDS_IN_BOOK, w) if __name__ == '__main__': from libepub.book import Book db = redis.StrictRedis(host='localhost', port=6379, db=0) db.delete(keys.WORDS_IN_BOOK) db.delete(keys.TOP_WORDS_IN_BOOK) db.delete(keys.NEW_WORDS_DISTRIBUTION_BY_CHAPTER) book = Book('../Being_Geek.epub') for chapter, c in enumerate(book.chapters[1:], 1): dump_words(db, chapter, filter_words(remove_html_tags(c.content))) print('All words of this book are stored in the database.') top_words = db.zrange(keys.TOP_WORDS_IN_BOOK, 0, 20, desc=True, withscores=True, score_cast_func=int) for w in top_words: print(w) print('number of words {:d}'.format(db.zcount(keys.TOP_WORDS_IN_BOOK, '-inf', '+inf'))) print('\nNumber of new words per chapter:') for key, value in sorted([(key, value) for key, value in db.hgetall(keys.NEW_WORDS_DISTRIBUTION_BY_CHAPTER).items()], key=lambda item: item[0]): print(key, value)
def test_filter_a_simple_sentence(): assert filter_words('this is a sentence') == ['sentence']
def test_some_random_sentence(): words = filter_words('Introduces ambiguity (unless the context happens to resolve it): read in.') assert words == ['introduces', 'ambiguity', 'unless', 'context', 'happens', 'resolve', 'read']
def test_filter_words_with_apostrophes(): words = filter_words('Jason`s and Sue\'s children!!!') assert words == ['jason`s', 'sue\'s', 'children']
def test_filter_words_with_dash(): words = filter_words('the player: 1 is now the JSON-like.') assert words == ['player', 'json-like']
def test_filter_words_with_comma(): words = filter_words('the player: 1 is now the winner.') assert words == ['player', 'winner']