Exemple #1
0
 def test_get_lexical_diversity_dictionary(self):
     """Check lexical diversity when restricted to dictionary words.

     The expected ratio of 0.5 depends on the fixture text found at
     ``/tmp/mauve_tok`` (presumably written by the test setup, which is
     not visible from here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; the result is a float ratio, so compare approximately
     # like the other float-valued tests do.
     self.assertAlmostEqual(
         1 / 2., book.get_lexical_diversity(only_dictionary_words=True))
Exemple #2
0
 def test_is_genre(self):
     """A genre query matches a complete tag name but not a fragment."""
     genre_tags = Tags()
     for tag_name in ('something', 'something else'):
         genre_tags.append(Tag(name=tag_name))

     tagged_book = Book(title='t',
                        author='a',
                        year_published=1,
                        tags=genre_tags)

     # 'som' is only the start of a tag name, so it must not match.
     self.assertFalse(tagged_book.is_genre('som'))
     self.assertTrue(tagged_book.is_genre('something'))
Exemple #3
0
    def test_safe_to_use(self):
        """Exercise ``safe_to_use`` over four metadata combinations."""
        text_path = '/tmp/mauve/isbn___Author___title.txt'

        # (author, year_published, num_ratings, expected truthiness)
        scenarios = (
            ('Author', 2020, 100, True),
            ('Author', 2020, 0, False),    # no ratings at all
            ('Author', 1700, 100, False),  # very old publication year
            ('Arthor', 2020, 100, False),  # author differs from file name
        )

        for author, year, ratings, expected in scenarios:
            candidate = Book(title='t',
                             author=author,
                             year_published=year,
                             num_ratings=ratings,
                             content_path=text_path)
            if expected:
                self.assertTrue(candidate.safe_to_use)
            else:
                self.assertFalse(candidate.safe_to_use)
Exemple #4
0
 def test_get_token_type_score(self):
     """Check the adjective and verb scores computed for the fixture text."""
     fixture_book = Book(title='t',
                         author='a',
                         year_published=1,
                         content_path='/tmp/mauve_tok')

     # (token type, expected score)
     expectations = (
         ('adjective', (2 / 3.) * 10000),
         ('verb', (1 / 3.) * 10000),
     )
     for token_type, expected_score in expectations:
         actual_score = fixture_book.get_token_type_score(token_type)
         self.assertAlmostEqual(expected_score, actual_score)
    def test_to_array(self):
        """``to_array`` must equal what iterating a deep copy yields."""
        tagged_docs = AlwaysIncludeBaseTaggedDocs()
        loaded_books = [
            Book(title=title, author='a', year_published=0)
            for title in ('1', '2')
        ]
        for loaded_book in loaded_books:
            tagged_docs.load(loaded_book)

        duplicate = copy.deepcopy(tagged_docs)

        as_array = tagged_docs.to_array()
        iterated = [doc for doc in duplicate]
        self.assertEqual(as_array, iterated)
    def test_iter(self):
        """Iterating the container yields documents tagged per loaded book."""
        tagged_docs = AlwaysIncludeBaseTaggedDocs()
        loaded_books = [
            Book(title=title, author='a', year_published=0)
            for title in ('1', '2')
        ]
        for loaded_book in loaded_books:
            tagged_docs.load(loaded_book)

        for position, doc in enumerate(tagged_docs):
            # The first document carries the first book's tag; every later
            # one is expected to carry the second book's tag.
            expected_tags = ['1_0'] if position == 0 else ['2_0']
            self.assertEqual(doc.tags, expected_tags)
Exemple #7
0
    def test_serialize(self):
        """Pin the full serialized form of a small fixture book."""
        # NOTE: The serialized shape is expected to evolve; this test exists
        #       so that any change to it is noticed.
        fixture_book = Book(
            title='t',
            author='Arthur',
            year_published=2020,
            num_ratings=100,
            content_path='/tmp/mauve_tok/NOPE___AUTHOR___TITLE.txt')

        serialized = fixture_book.serialize()

        expected = {
            # Identity / metadata fields
            'analysis_version': int(constants.ANALYSIS_VERSION),
            'author_similarity': False,
            'title': 't',
            'author': 'Arthur',
            'author_gender': 'male',
            'year_published': 2020,
            'publisher': None,
            'isbn': None,
            'isbn13': None,
            'subtitle': None,
            'avg_rating': None,
            'author_nationality': None,
            'author_birth_year': None,
            'num_ratings': 100,
            'tags': [],
            'reviews': [],
            # Values derived from the text content
            'word_count': 3,
            'lexical_diversity': 1.0,
            'avg_word_len': 2.6666666666666665,
            'profanity_score': 0,
            'avg_sentence_word_len': 3,
            'avg_sentence_char_len': 10,
            'adverb_score': 0.0,
            'interjection_score': 0.0,
            'adjective_score': 3333.3333333333335,
            'top_adjectives': {'blue': 1},
            'top_nouns': {},
            'top_verbs': {'is': 1},
            'flesch_reading_ease_score': 119.19,
            'reading_difficulty': 0,
            'reading_time': 0.7201152184349495,
            'sentiment': 0,
            'cliche_score': 0,
        }

        self.assertEqual(serialized, expected)
Exemple #8
0
def iter_books(source='goodreads') -> Iterator:
    """
    Lazily build one Book per metadata record of the given source.

    :kwarg source: name of the metadata source, passed on to get_metadata
    :return: generator of book objects
    """
    # Imported at call time rather than at module load
    # (presumably to sidestep a circular import -- TODO confirm).
    from mauve.models.books.book import Book

    for meta in get_metadata(source=source):
        # 'original_filename' is the only key that must be present.
        text_path = os.path.join(TEXT_PATH, meta['original_filename'])

        # Every genre listed in the metadata becomes a tag on the book.
        book_tags = Tags()
        for genre_name in meta.get('genres', []):
            book_tags.append(Tag(name=genre_name))

        yield Book(
            title=meta.get('book_title'),
            isbn=meta.get('isbn'),
            isbn13=meta.get('isbn13'),
            year_published=meta.get('year_first_published'),
            author=meta.get('author'),
            avg_rating=meta.get('average_rating'),
            tags=book_tags,
            num_ratings=meta.get('num_ratings'),
            content_path=text_path,
        )
 def test_load(self):
     """Loading a book adds it to ``items`` and raises ``num_items`` to 1."""
     tagged_docs = AlwaysIncludeBaseTaggedDocs()

     # A fresh container starts out empty.
     self.assertEqual(tagged_docs.items, [])
     self.assertEqual(tagged_docs.num_items, 0)

     only_book = Book(title='t', author='a', year_published=0)
     tagged_docs.load(only_book)

     self.assertEqual(tagged_docs.items, [only_book])
     self.assertEqual(tagged_docs.num_items, 1)
Exemple #10
0
 def test_set_reviews(self):
     """Reviews passed to the constructor are exposed and serializable."""
     single_review = Review(user='******', score=5)
     reviewed_book = Book(title='t',
                          author='Arthur',
                          year_published=2020,
                          num_ratings=100,
                          reviews=Reviews(data=[single_review]))

     expected = [{'score': 5, 'user': '******'}]
     self.assertEqual(reviewed_book.reviews.serialize(), expected)
Exemple #11
0
 def test_sentences_tokens(self):
     """Check the per-sentence (token, tag) pairs for the fixture text.

     The expected value depends on the content at ``/tmp/mauve_tok``
     (presumably written by the test setup, which is not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')

     # One inner list per sentence, each holding (token, tag) tuples.
     expected = [
         [('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'),
          ('little', 'JJ'), ('teapot', 'NN'), ('.', '.')],
         [('Really', 'RB'), (',', ','), ('I', 'PRP'),
          ('am', 'VBP'), ('!', '.')],
         [('Right', 'NNP'), ('?', '.')],
     ]

     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; ``assertEqual`` is the supported spelling.
     self.assertEqual(book.sentences_tokens, expected)
Exemple #12
0
 def should_include(self, book: Book) -> bool:
     """
     Decide whether the book qualifies for inclusion.

     A book is rejected when it has no group name, fewer than 1000
     ratings, a falsy author similarity, is not tagged as fiction, or
     has fewer than 10000 words.
     """
     # The checks short-circuit in this order, so later (potentially more
     # expensive) attributes are only read when earlier checks pass.
     rejected = (
         self.get_group_name(book) is None
         or book.num_ratings < 1000
         or not book.author_similarity
         or not book.is_genre('fiction')
         or book.word_count < 10000
     )
     return not rejected
Exemple #13
0
 def get_is_usable(self, book: Book) -> bool:
     """
     Get if the given book passes the requirements to be
     included in the data.

     A book is unusable when any configured requirement fails
     (required genre, safe-to-use flag, required language) or when it
     has no content. Requirements that are unset (falsy) are skipped.

     :param book: the book to check
     :return: True if the book may be used, False otherwise
     """
     # The checks short-circuit in this order, so later attributes are
     # only read when the earlier checks did not already disqualify.
     unusable = (
         (self.required_genre and not book.is_genre(self.required_genre))
         or (self.required_safe_to_use and not book.safe_to_use)
         or (self.required_lang and book.lang != self.required_lang)
         or not book.has_content
     )
     if unusable:
         # NOTE(review): the message lacks a space before "is"; left
         # untouched in case anything matches on the log text.
         logger.debug('\'%s\' by \'%s\'is not usable', book.title,
                      book.author.name)
         return False
     return True
Exemple #14
0
    def test_author_similarity(self):
        """Compare the book's author against the author in the file name."""
        # (author, content_path, expected truthiness)
        scenarios = (
            ('Author', '/tmp/mauve/isbn___Author___title.txt', True),
            ('Author', '/tmp/mauve/isbn___Author M___title.txt', True),
            # NOTE(review): identical to the previous case; possibly a
            # different variant was intended here.
            ('Author', '/tmp/mauve/isbn___Author M___title.txt', True),
            ('Arthor', '/tmp/mauve/isbn___Author___title.txt', False),
        )

        for author, text_path, expected in scenarios:
            candidate = Book(title='t',
                             author=author,
                             year_published=1,
                             content_path=text_path)
            if expected:
                self.assertTrue(candidate.author_similarity)
            else:
                self.assertFalse(candidate.author_similarity)
Exemple #15
0
    def test_compress_file(self):
        """Compressing a token pickle yields a ``.bz`` with equal content."""
        epub_book = Book(title='t',
                         author='a',
                         year_published=0,
                         content_path=self.clean_epub_1)
        # Bare attribute reads, kept for their side effects (presumably
        # they write the token pickles to disk -- TODO confirm).
        epub_book.all_tokens
        epub_book.word_tokens

        compress_file(epub_book.all_tokens_pickle_path)
        self.assertTrue(
            os.path.exists(epub_book.all_tokens_pickle_path + '.bz'))

        # Content read back must be the same from either file.
        plain_content = get_file_content(epub_book.all_tokens_pickle_path)
        compressed_content = get_file_content(
            epub_book.all_tokens_pickle_path + '.bz')
        self.assertEqual(plain_content, compressed_content)

        # A plain text path is not accepted for compression.
        with self.assertRaises(NotImplementedError):
            compress_file(self.text_path_1)
Exemple #16
0
    def update_groups(self, book: Book) -> None:
        """
        Fold one book's word counts into every group the book belongs to.

        The groups come from ``self.grouper(book)``, optionally narrowed
        to ``self.only_groups``. For each group, one value per word is
        appended to ``self.groups[group_name][word]``:

        * ``'by_book'``: the constant 1
        * ``'by_word'``: the word's count divided by the number of words
          in the book

        Words seen for the first time in a group are back-filled with one
        zero per book previously recorded for that group, and
        ``self.prevs[group_name]`` is incremented once per book.

        :param book: the book whose words are recorded
        :raises ValueError: if ``self.method`` is neither ``'by_book'``
            nor ``'by_word'``
        """
        group_names = set(self.grouper(book))
        if self.only_groups is not None:
            group_names = group_names.intersection(self.only_groups)
        if not group_names:
            return

        counts = book.get_word_counts(only_include_words=self.only_words)
        tot = len(book.words)

        for group_name in group_names:
            if self.only_words is None:
                # TODO: take global words from all groups
                expected_words = self.groups[group_name].keys()
            else:
                expected_words = self.only_words

            # Words the group (or the explicit word list) knows about but
            # this book lacks get an explicit zero count. ``counts`` is
            # shared across groups, so padding added here carries over to
            # the following groups of the same book.
            for missing_word in set(expected_words) - set(counts.keys()):
                counts[missing_word] = 0

            for word, times_used in counts.items():
                if self.method == 'by_book':
                    # NOTE(review): this records 1 even for the zero-count
                    # padding words added above -- confirm that is intended.
                    value = 1
                elif self.method == 'by_word':
                    value = times_used / tot
                else:
                    raise ValueError(
                        'Unknown method: %r' % (self.method,))

                try:
                    self.groups[group_name][word].append(value)
                except KeyError:
                    # First sighting of this word in the group: pad with a
                    # zero for every book recorded before this one.
                    self.groups[group_name][word] = (
                        [0] * self.prevs[group_name] + [value])

            self.prevs[group_name] += 1
    def test_compress_files(self):
        """Bulk ``compress`` swaps the word-token pickle for a ``.bz`` file."""
        epub_book = Book(title='t',
                         author='a',
                         year_published=0,
                         content_path=self.clean_epub_1)
        # Bare attribute reads, kept for their side effects (presumably
        # they write the token pickles to disk -- TODO confirm).
        epub_book.all_tokens
        epub_book.word_tokens

        original_content = get_file_content(
            epub_book.word_tokens_pickle_path)

        compress(num_processes=1)

        # The plain pickle must be gone and its .bz counterpart present.
        self.assertTrue(
            os.path.exists(epub_book.word_tokens_pickle_path + '.bz'))
        self.assertFalse(os.path.exists(epub_book.word_tokens_pickle_path))

        compressed_content = get_file_content(
            epub_book.word_tokens_pickle_path + '.bz')
        self.assertEqual(original_content, compressed_content)
Exemple #18
0
 def test_get_top_adjectives(self):
     """Check the adjective frequency mapping for the fixture text.

     The expected counts depend on the content at ``/tmp/mauve_tok``
     (presumably written by the test setup, which is not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; ``assertEqual`` is the supported spelling.
     self.assertEqual(book.get_top_adjectives(10), {'happy': 2, 'big': 1})
Exemple #19
0
 def test_get_top_verbs(self):
     """Check the verb frequency mapping for the fixture text.

     The expected counts depend on the content at ``/tmp/mauve_tok``
     (presumably written by the test setup, which is not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; ``assertEqual`` is the supported spelling.
     self.assertEqual(book.get_top_verbs(10), {'go': 2, 'run': 1})
Exemple #20
0
 def test_get_lexical_diversity_2(self):
     """Check the lexical diversity ratio for the fixture text.

     The expected ratio of 1/3 depends on the content at
     ``/tmp/mauve_tok`` (presumably written by the test setup, which is
     not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12. 1/3 is not exactly representable as a float, so
     # compare approximately like the other float-valued tests do.
     self.assertAlmostEqual(1 / 3., book.get_lexical_diversity())
Exemple #21
0
 def test_get_top_nouns(self):
     """Check the noun frequency mapping for the fixture text.

     The expected counts depend on the content at ``/tmp/mauve_tok``
     (presumably written by the test setup, which is not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; ``assertEqual`` is the supported spelling.
     self.assertEqual(book.get_top_nouns(10), {'pencil': 2, 'house': 1})
Exemple #22
0
 def test_get_profanity_score_3(self):
     """Check the profanity score computed for the fixture text."""
     fixture_book = Book(title='t',
                         author='a',
                         year_published=1,
                         content_path='/tmp/mauve_tok')

     expected_score = (2 / 3.) * 10000
     actual_score = fixture_book.get_profanity_score()
     self.assertAlmostEqual(expected_score, actual_score)
Exemple #23
0
 def test_adverbs(self):
     """Check the list of adverbs found in the fixture text.

     The expected list depends on the content at ``/tmp/mauve_tok``
     (presumably written by the test setup, which is not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; ``assertEqual`` is the supported spelling.
     self.assertEqual(book.adverbs, ['quietly'])
Exemple #24
0
 def test_adjectives(self):
     """Check the list of adjectives found in the fixture text.

     The expected list depends on the content at ``/tmp/mauve_tok``
     (presumably written by the test setup, which is not visible here).
     """
     book = Book(title='t',
                 author='a',
                 year_published=1,
                 content_path='/tmp/mauve_tok')
     # ``assertEquals`` is a deprecated alias that was removed in
     # Python 3.12; ``assertEqual`` is the supported spelling.
     self.assertEqual(book.adjectives, ['blue'])