def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    Count the individual words (rather than pages) in an ePub's text.

    The text is obtained from ``_read_epub_contents`` with HTML stripped.
    Three counting strategies are attempted in order, each one used only
    if the previous one raised:

    1. ``calibre.spell.break_iterator.count_words``
    2. the number of items returned by
       ``calibre.spell.break_iterator.split_into_words_and_positions``
    3. the ``words`` attribute of
       ``calibre.utils.wordcount.get_wordcount_obj``

    :param iterator: passed through unchanged to ``_read_epub_contents``.
    :param lang: language code handed to the break-iterator based counters.
    :return: the word count produced by the first strategy that succeeds.
    :raises Exception: whatever the last fallback raises, if all three fail.
    '''
    book_text = _read_epub_contents(iterator, strip_html=True)

    # Catch ``Exception`` rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate, while any ordinary
    # failure (missing import, counting error) falls through to the next
    # strategy.
    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s' % wordcount)
    except Exception:
        try:
            # The above method is new and no-one will have it as of
            # 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug(
                '\tWord count - split_into_words_and_positions method:%s' % wordcount)
        except Exception:
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s' % wordcount)

    return wordcount
def test_break_iterator(self):
    ''' Exercise word splitting and whole-word lookup of the break iterator '''
    from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions

    three_words = ['one', 'two', 'three']
    for sample in ('one two three', ' one two three', 'one\ntwo three ', ):
        self.ae(split(unicode(sample)), three_words, 'Failed to split: %r' % sample)

    for phrase, expected_words in (
        (u'I I\'m', ['I', "I'm"]),
        (u'out-of-the-box', ['out-of-the-box']),
        (u'-one two-', ['one', 'two']),
    ):
        self.ae(split(phrase), expected_words)

    # The non-BMP character occupies two positions on narrow builds and one
    # otherwise, which shifts every offset that follows it.
    emoji_len = 2 if icu.is_narrow_build else 1
    self.ae(
        split_into_words_and_positions('one \U0001f431 three'),
        [(0, 3), (4 + emoji_len + 1, 5)])

    lookups = (
        ('word', 'a word b', 2),
        ('word', 'a word', 2),
        ('one-two', 'a one-two punch', 2),
        ('one-two', 'one-two punch', 0),
        ('one-two', 'one-two', 0),
        ('one', 'one-two one', 8),
        ('one-two', 'one-two-three one-two', 14),
        ('one', 'onet one', 5),
        ('two', 'one-two two', 8),
        ('i', 'i', 0),
        ('i', 'six i', 4),
        ('i', '', -1),
        ('', '', -1),
        ('', 'i', -1),
        ('i', 'six clicks', -1),
        ('i', '\U0001f431 i', emoji_len + 1),
    )
    for needle, haystack, expected in lookups:
        found = index_of(needle, haystack)
        self.ae(
            expected, found,
            'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, expected, found))
def process_text(state, text, nbsp_format, spell_format, user_data):
    '''
    Break ``text`` into a list of ``(length, format)`` runs.

    The text is first divided around every match of ``nbsp_pat``: matched
    spans get ``nbsp_format``, everything else gets a format reflecting the
    bold/italic flags of ``state`` (``None`` when neither is set).

    When inline spell checking is enabled and the innermost tag in
    ``state.tags`` is accepted by ``user_data.tag_ok_for_spell``, each
    non-nbsp run is further split into words; words not recognized by
    ``dictionaries`` get a copy of ``spell_format`` (merged with the
    bold/italic format) carrying ``SPELL_PROPERTY`` set to
    ``(word, locale)``.
    '''
    base_fmt = None
    if state.is_bold or state.is_italic:
        base_fmt = SyntaxTextCharFormat()
        if state.is_bold:
            base_fmt.setFontWeight(QFont.Bold)
        if state.is_italic:
            base_fmt.setFontItalic(True)

    # Pass 1: carve the text into alternating plain / nbsp runs.
    runs = []
    consumed = 0
    for match in nbsp_pat.finditer(text):
        begin, end = match.start(), match.end()
        runs.append((begin - consumed, base_fmt))
        runs.append((end - begin, nbsp_format))
        consumed = end
    if not runs:
        runs = [(len(text), base_fmt)]
    elif consumed < len(text):
        runs.append((len(text) - consumed, base_fmt))

    spell_check_applies = (
        tprefs['inline_spell_check']
        and state.tags
        and user_data.tag_ok_for_spell(state.tags[-1].name)
        and hasattr(dictionaries, 'active_user_dictionaries')
    )
    if not spell_check_applies:
        return runs

    # Pass 2: subdivide the plain runs word by word and flag unknown words.
    locale = state.current_lang or dictionaries.default_locale
    misspelled_base = SyntaxTextCharFormat(spell_format)
    if base_fmt is not None:
        misspelled_base.merge(base_fmt)

    checked = []
    offset = 0
    for run_len, run_fmt in runs:
        if run_fmt is nbsp_format:
            checked.append((run_len, run_fmt))
        else:
            chunk = text[offset:offset + run_len]
            cursor = 0
            for start, length in split_into_words_and_positions(
                    chunk, lang=locale.langcode):
                if start > cursor:
                    # Non-word characters between the previous word and this one.
                    checked.append((start - cursor, run_fmt))
                cursor = start + length
                word = chunk[start:cursor]
                if dictionaries.recognized(word, locale):
                    word_fmt = run_fmt
                else:
                    word_fmt = SyntaxTextCharFormat(misspelled_base)
                    word_fmt.setProperty(SPELL_PROPERTY, (word, locale))
                checked.append((length, word_fmt))
            if cursor < run_len:
                # Trailing non-word characters after the last word.
                checked.append((run_len - cursor, run_fmt))
        offset += run_len
    return checked
def test_break_iterator(self):
    ''' Exercise splitting, counting and whole-word lookup of the break iterator '''
    from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions, count_words

    three_words = ['one', 'two', 'three']
    for sample in (
        'one two three',
        ' one two three',
        'one\ntwo three ',
    ):
        self.ae(split(str(sample)), three_words, 'Failed to split: %r' % sample)

    split_cases = (
        ('I I\'m', ['I', "I'm"]),
        ('out-of-the-box', ['out-of-the-box']),
        ('-one two-', ['-one', 'two-']),
        ('-one a-b-c-d e', ['-one', 'a-b-c-d', 'e']),
        ('-one -a-b-c-d- e', ['-one', '-a-b-c-d-', 'e']),
    )
    for phrase, expected_words in split_cases:
        self.ae(split(phrase), expected_words)

    self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6, 5)])
    self.ae(count_words('a b c d e f'), 6)

    lookups = (
        ('word', 'a word b', 2),
        ('word', 'a word', 2),
        ('one-two', 'a one-two punch', 2),
        ('one-two', 'one-two punch', 0),
        ('one-two', 'one-two', 0),
        ('one', 'one-two one', 8),
        ('one-two', 'one-two-three one-two', 14),
        ('one', 'onet one', 5),
        ('two', 'one-two two', 8),
        ('two', 'two-one two', 8),
        ('-two', 'one-two -two', 8),
        ('-two', 'two', -1),
        ('i', 'i', 0),
        ('i', 'six i', 4),
        ('i', '', -1),
        ('', '', -1),
        ('', 'i', -1),
        ('i', 'six clicks', -1),
        ('i', '\U0001f431 i', 2),
        ('-a', 'b -a', 2),
        ('a-', 'a-b a- d', 4),
        ('-a-', 'b -a -a-', 5),
        ('-a-', '-a-', 0),
        ('-a-', 'a-', -1),
        ('-a-', '-a', -1),
        ('-a-', 'a', -1),
        ('a-', 'a-', 0),
        ('-a', '-a', 0),
        ('a-b-c-', 'a-b-c-d', -1),
        ('a-b-c-', 'a-b-c-.', 0),
        ('a-b-c-', 'a-b-c-d a-b-c- d', 8),
    )
    for needle, haystack, expected in lookups:
        found = index_of(needle, haystack)
        self.ae(
            expected, found,
            'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, expected, found))
def test_break_iterator(self):
    ''' Exercise word splitting and whole-word lookup of the break iterator '''
    from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions

    three_words = ['one', 'two', 'three']
    for sample in ('one two three', ' one two three', 'one\ntwo three ', 'one-two,three'):
        self.ae(split(unicode(sample)), three_words, 'Failed to split: %r' % sample)
    self.ae(split(u'I I\'m'), ['I', "I'm"])

    # The offset of the word after the non-BMP character depends on whether
    # sys.maxunicode covers the full Unicode range.
    if sys.maxunicode >= 0x10ffff:
        third_word_start = 6
    else:
        third_word_start = 7
    self.ae(
        split_into_words_and_positions('one \U0001f431 three'),
        [(0, 3), (third_word_start, 5)])

    for expected, needle, haystack in (
        (0, 'i', 'i'),
        (4, 'i', 'six i'),
        (-1, 'i', ''),
        (-1, '', ''),
        (-1, '', 'i'),
        (-1, 'i', 'six clicks'),
    ):
        self.ae(expected, index_of(needle, haystack))
def process_text(state, text, nbsp_format, spell_format, user_data):
    '''
    Produce the ``(length, format)`` runs that cover ``text``.

    Spans matched by ``nbsp_pat`` are given ``nbsp_format``; the remaining
    text is given a format built from ``state.is_bold`` / ``state.is_italic``
    (or ``None`` if neither flag is set).

    If the ``inline_spell_check`` preference is on, ``state.tags`` is
    non-empty, its last tag passes ``user_data.tag_ok_for_spell`` and
    ``dictionaries`` has an ``active_user_dictionaries`` attribute, the
    non-nbsp runs are split into words and every word that
    ``dictionaries.recognized`` rejects is given a spell-error format with
    ``SPELL_PROPERTY`` set to ``(word, locale)``.
    '''
    plain_fmt = None
    if state.is_bold or state.is_italic:
        plain_fmt = SyntaxTextCharFormat()
        if state.is_bold:
            plain_fmt.setFontWeight(QFont.Bold)
        if state.is_italic:
            plain_fmt.setFontItalic(True)

    # Split around nbsp matches first.
    segments = []
    done = 0
    for hit in nbsp_pat.finditer(text):
        hit_start, hit_end = hit.start(), hit.end()
        segments.extend((
            (hit_start - done, plain_fmt),
            (hit_end - hit_start, nbsp_format),
        ))
        done = hit_end
    if not segments:
        segments = [(len(text), plain_fmt)]
    elif done < len(text):
        segments.append((len(text) - done, plain_fmt))

    wants_spell_check = (
        tprefs['inline_spell_check']
        and state.tags
        and user_data.tag_ok_for_spell(state.tags[-1].name)
        and hasattr(dictionaries, 'active_user_dictionaries')
    )
    if not wants_spell_check:
        return segments

    locale = state.current_lang or dictionaries.default_locale
    error_fmt = SyntaxTextCharFormat(spell_format)
    if plain_fmt is not None:
        error_fmt.merge(plain_fmt)

    result = []
    seg_start = 0
    for seg_len, seg_fmt in segments:
        if seg_fmt is nbsp_format:
            # nbsp runs are never spell checked; pass them through untouched.
            result.append((seg_len, seg_fmt))
        else:
            piece = text[seg_start:seg_start + seg_len]
            covered = 0
            words = split_into_words_and_positions(piece, lang=locale.langcode)
            for start, length in words:
                if start > covered:
                    result.append((start - covered, seg_fmt))
                covered = start + length
                word = piece[start:covered]
                if dictionaries.recognized(word, locale):
                    result.append((length, seg_fmt))
                else:
                    # Each misspelled word gets its own format object so the
                    # word and locale can be attached to it.
                    marked = SyntaxTextCharFormat(error_fmt)
                    marked.setProperty(SPELL_PROPERTY, (word, locale))
                    result.append((length, marked))
            if covered < seg_len:
                result.append((seg_len - covered, seg_fmt))
        seg_start += seg_len
    return result
def check_spelling(text, tlen, fmt, locale, sfmt, store_locale):
    '''
    Split ``text`` into ``(length, format)`` runs, flagging unknown words.

    Words are located with ``split_into_words_and_positions`` using
    ``locale.langcode``. Words accepted by ``dictionaries.recognized`` and
    all text between words keep ``fmt``. Rejected words get ``sfmt``, or,
    when ``store_locale`` is true, a fresh ``QTextCharFormat`` copy of
    ``sfmt`` with ``SPELL_LOCALE_PROPERTY`` set to ``locale``.

    ``tlen`` is the length used to decide whether trailing text remains
    after the last word.
    '''
    is_known = dictionaries.recognized
    runs = []
    cursor = 0
    for start, length in split_into_words_and_positions(text, lang=locale.langcode):
        gap = start - cursor
        if gap > 0:
            # Non-word characters preceding this word.
            runs.append((gap, fmt))
        cursor = start + length
        if is_known(text[start:cursor], locale):
            word_fmt = fmt
        elif store_locale:
            word_fmt = QTextCharFormat(sfmt)
            word_fmt.setProperty(SPELL_LOCALE_PROPERTY, locale)
        else:
            word_fmt = sfmt
        runs.append((length, word_fmt))
    remainder = tlen - cursor
    if remainder > 0:
        runs.append((remainder, fmt))
    return runs
def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    This algorithm counts individual words instead of pages.

    The book text comes from ``_read_epub_contents`` with HTML stripped.
    Counting is best-effort: ``count_words`` is tried first, then the
    length of ``split_into_words_and_positions``, and finally the ``words``
    attribute of ``get_wordcount_obj``; each fallback runs only when the
    previous attempt raised.

    :param iterator: forwarded as-is to ``_read_epub_contents``.
    :param lang: language code given to the break-iterator based counters.
    :return: the word count from the first method that succeeds.
    :raises Exception: the error from the final fallback if every method fails.
    '''
    book_text = _read_epub_contents(iterator, strip_html=True)

    # ``except Exception`` (not a bare ``except``) keeps the fallback chain
    # for ordinary failures while letting KeyboardInterrupt and SystemExit
    # propagate instead of being silently swallowed.
    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s'%wordcount)
    except Exception:
        try:
            # The above method is new and no-one will have it as of
            # 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug('\tWord count - split_into_words_and_positions method:%s'%wordcount)
        except Exception:
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s'%wordcount)

    return wordcount