Python split_into_words_and_positionsの例、calibre.spell.break_iterator.split_into_words_and_positions Pythonの例

コード例 #1

0

ファイルを表示

def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    Count the individual words (rather than pages) in a book.

    The text is obtained from ``_read_epub_contents(iterator, strip_html=True)``
    and then counted with the first of these strategies that works:

    1. ``calibre.spell.break_iterator.count_words``
    2. the number of entries returned by
       ``calibre.spell.break_iterator.split_into_words_and_positions``
    3. ``calibre.utils.wordcount.get_wordcount_obj(...).words``

    :param iterator: passed through unchanged to ``_read_epub_contents``.
    :param lang: language code handed to the break-iterator based counters.
    :return: the word count produced by whichever strategy succeeded.
    :raises: whatever the last-resort strategy raises if it fails as well.
    '''

    book_text = _read_epub_contents(iterator, strip_html=True)

    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s' % wordcount)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed by the fallback.
        try:  # The above method is new and no-one will have it as of 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug(
                '\tWord count - split_into_words_and_positions method:%s' %
                wordcount)
        except Exception:
            # Last resort: the legacy word counter.
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s' % wordcount)

    return wordcount

コード例 #2

0

ファイルを表示

ファイル: icu_test.py プロジェクト: kba/calibre

 def test_break_iterator(self):
     ' Exercise word splitting, word positions and index_of of the break iterator '
     from calibre.spell.break_iterator import (
         split_into_words as split, index_of, split_into_words_and_positions)

     # Whitespace variations must all yield the same three words.
     for sample in ('one two three', ' one two three', 'one\ntwo  three '):
         self.ae(
             split(unicode(sample)),
             ['one', 'two', 'three'],
             'Failed to split: %r' % sample)

     # Apostrophes and inner hyphens stay inside a word; outer hyphens do not.
     self.ae(split(u"I I'm"), ['I', "I'm"])
     self.ae(split(u'out-of-the-box'), ['out-of-the-box'])
     self.ae(split(u'-one two-'), ['one', 'two'])

     # The start offset after the non-BMP character depends on the build.
     self.ae(
         split_into_words_and_positions('one \U0001f431 three'),
         [(0, 3), (7 if icu.is_narrow_build else 6, 5)])

     # (word to look for, text to search, expected index or -1)
     cases = (
         ('word', 'a word b', 2),
         ('word', 'a word', 2),
         ('one-two', 'a one-two punch', 2),
         ('one-two', 'one-two punch', 0),
         ('one-two', 'one-two', 0),
         ('one', 'one-two one', 8),
         ('one-two', 'one-two-three one-two', 14),
         ('one', 'onet one', 5),
         ('two', 'one-two two', 8),
         ('i', 'i', 0),
         ('i', 'six i', 4),
         ('i', '', -1),
         ('', '', -1),
         ('', 'i', -1),
         ('i', 'six clicks', -1),
         ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
     )
     for word, text, expected in cases:
         found = index_of(word, text)
         self.ae(
             expected, found,
             'Failed to find index of %r in %r (%d != %d)' % (word, text, expected, found))

コード例 #3

0

ファイルを表示

def process_text(state, text, nbsp_format, spell_format, user_data):
    '''
    Split ``text`` into a list of ``(length, format)`` runs.

    Every match of ``nbsp_pat`` gets ``nbsp_format``; the text in between
    gets a bold/italic format built from ``state`` (or None when neither
    flag is set). When inline spell checking applies, each non-nbsp run is
    further split into words, and words that ``dictionaries.recognized``
    rejects get a copy of the spelling format tagged with SPELL_PROPERTY.
    '''
    base_fmt = None
    if state.is_bold or state.is_italic:
        base_fmt = SyntaxTextCharFormat()
        if state.is_bold:
            base_fmt.setFontWeight(QFont.Bold)
        if state.is_italic:
            base_fmt.setFontItalic(True)

    # First pass: alternate between ordinary text and nbsp matches.
    runs = []
    cursor = 0
    for match in nbsp_pat.finditer(text):
        begin, end = match.span()
        runs.append((begin - cursor, base_fmt))
        runs.append((end - begin, nbsp_format))
        cursor = end
    total = len(text)
    if not runs:
        runs = [(total, base_fmt)]
    elif cursor < total:
        runs.append((total - cursor, base_fmt))

    spell_check_applies = (
        tprefs['inline_spell_check'] and state.tags
        and user_data.tag_ok_for_spell(state.tags[-1].name)
        and hasattr(dictionaries, 'active_user_dictionaries'))
    if not spell_check_applies:
        return runs

    # Second pass: break the ordinary runs into words and gaps.
    locale = state.current_lang or dictionaries.default_locale
    misspelled_fmt = SyntaxTextCharFormat(spell_format)
    if base_fmt is not None:
        misspelled_fmt.merge(base_fmt)

    pieces = []
    offset = 0
    for run_len, run_fmt in runs:
        if run_fmt is nbsp_format:
            # nbsp runs are passed through untouched.
            pieces.append((run_len, run_fmt))
            offset += run_len
            continue

        chunk = text[offset:offset + run_len]
        consumed = 0
        for word_start, word_len in split_into_words_and_positions(
                chunk, lang=locale.langcode):
            if word_start > consumed:
                # Non-word text between the previous word and this one.
                pieces.append((word_start - consumed, run_fmt))
            consumed = word_start + word_len
            word = chunk[word_start:consumed]
            if dictionaries.recognized(word, locale):
                pieces.append((word_len, run_fmt))
            else:
                flagged = SyntaxTextCharFormat(misspelled_fmt)
                flagged.setProperty(SPELL_PROPERTY, (word, locale))
                pieces.append((word_len, flagged))
        if consumed < run_len:
            # Trailing non-word text after the last word.
            pieces.append((run_len - consumed, run_fmt))
        offset += run_len

    return pieces

コード例 #4

0

ファイルを表示

 def test_break_iterator(self):
     ' Exercise splitting, positions, counting and index_of of the break iterator '
     from calibre.spell.break_iterator import (
         split_into_words as split, index_of,
         split_into_words_and_positions, count_words)

     # Whitespace variations must all yield the same three words.
     samples = ('one two three', ' one two three', 'one\ntwo  three ')
     for sample in samples:
         self.ae(
             split(str(sample)),
             ['one', 'two', 'three'],
             'Failed to split: %r' % sample)

     # Apostrophes and hyphens (leading, inner, trailing) are kept in words.
     self.ae(split("I I'm"), ['I', "I'm"])
     self.ae(split('out-of-the-box'), ['out-of-the-box'])
     self.ae(split('-one two-'), ['-one', 'two-'])
     self.ae(split('-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
     self.ae(split('-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])

     self.ae(
         split_into_words_and_positions('one \U0001f431 three'),
         [(0, 3), (6, 5)])
     self.ae(count_words('a b c d e f'), 6)

     # (word to look for, text to search, expected index or -1)
     cases = (
         ('word', 'a word b', 2),
         ('word', 'a word', 2),
         ('one-two', 'a one-two punch', 2),
         ('one-two', 'one-two punch', 0),
         ('one-two', 'one-two', 0),
         ('one', 'one-two one', 8),
         ('one-two', 'one-two-three one-two', 14),
         ('one', 'onet one', 5),
         ('two', 'one-two two', 8),
         ('two', 'two-one two', 8),
         ('-two', 'one-two -two', 8),
         ('-two', 'two', -1),
         ('i', 'i', 0),
         ('i', 'six i', 4),
         ('i', '', -1),
         ('', '', -1),
         ('', 'i', -1),
         ('i', 'six clicks', -1),
         ('i', '\U0001f431 i', 2),
         ('-a', 'b -a', 2),
         ('a-', 'a-b a- d', 4),
         ('-a-', 'b -a -a-', 5),
         ('-a-', '-a-', 0),
         ('-a-', 'a-', -1),
         ('-a-', '-a', -1),
         ('-a-', 'a', -1),
         ('a-', 'a-', 0),
         ('-a', '-a', 0),
         ('a-b-c-', 'a-b-c-d', -1),
         ('a-b-c-', 'a-b-c-.', 0),
         ('a-b-c-', 'a-b-c-d a-b-c- d', 8),
     )
     for word, text, expected in cases:
         found = index_of(word, text)
         message = 'Failed to find index of %r in %r (%d != %d)' % (
             word, text, expected, found)
         self.ae(expected, found, message)

コード例 #5

0

ファイルを表示

ファイル: icu_test.py プロジェクト: mrmac123/calibre

 def test_break_iterator(self):
     ' Exercise word splitting, word positions and index_of of the break iterator '
     from calibre.spell.break_iterator import (
         split_into_words as split, index_of, split_into_words_and_positions)

     # Whitespace, hyphen and comma separators must all yield three words.
     samples = ('one two three', ' one two three', 'one\ntwo  three ', 'one-two,three')
     for sample in samples:
         self.ae(
             split(unicode(sample)),
             ['one', 'two', 'three'],
             'Failed to split: %r' % sample)
     self.ae(split(u"I I'm"), ['I', "I'm"])

     # The start offset after the non-BMP character depends on sys.maxunicode.
     self.ae(
         split_into_words_and_positions('one \U0001f431 three'),
         [(0, 3), (7 if sys.maxunicode < 0x10ffff else 6, 5)])

     # (expected index or -1, word to look for, text to search)
     for expected, word, text in (
             (0, 'i', 'i'),
             (4, 'i', 'six i'),
             (-1, 'i', ''),
             (-1, '', ''),
             (-1, '', 'i'),
             (-1, 'i', 'six clicks'),
     ):
         self.ae(expected, index_of(word, text))

コード例 #6

0

ファイルを表示

ファイル: html.py プロジェクト: dusual/calibre

def process_text(state, text, nbsp_format, spell_format, user_data):
    '''
    Return the ``(length, format)`` runs that cover ``text``.

    Matches of ``nbsp_pat`` are given ``nbsp_format``; everything else gets
    the bold/italic format derived from ``state`` (None if neither is set).
    If inline spell checking applies, the non-nbsp runs are split into
    words, and each word that ``dictionaries.recognized`` rejects gets its
    own spelling format carrying SPELL_PROPERTY.
    '''
    plain_fmt = None
    if state.is_bold or state.is_italic:
        plain_fmt = SyntaxTextCharFormat()
        if state.is_bold:
            plain_fmt.setFontWeight(QFont.Bold)
        if state.is_italic:
            plain_fmt.setFontItalic(True)

    # Pass 1: carve the text into ordinary runs and nbsp runs.
    segments = []
    prev_end = 0
    for hit in nbsp_pat.finditer(text):
        hit_start, hit_end = hit.start(), hit.end()
        segments.append((hit_start - prev_end, plain_fmt))
        segments.append((hit_end - hit_start, nbsp_format))
        prev_end = hit_end
    text_len = len(text)
    if not segments:
        segments = [(text_len, plain_fmt)]
    elif prev_end < text_len:
        segments.append((text_len - prev_end, plain_fmt))

    if not (tprefs['inline_spell_check'] and state.tags and user_data.tag_ok_for_spell(state.tags[-1].name) and hasattr(dictionaries, 'active_user_dictionaries')):
        return segments

    # Pass 2: split ordinary runs into words so misspellings can be marked.
    locale = state.current_lang or dictionaries.default_locale
    spelling_fmt = SyntaxTextCharFormat(spell_format)
    if plain_fmt is not None:
        spelling_fmt.merge(plain_fmt)

    result = []
    seg_start = 0
    for seg_len, seg_fmt in segments:
        if seg_fmt is not nbsp_format:
            seg_text = text[seg_start:seg_start+seg_len]
            done = 0
            for w_start, w_len in split_into_words_and_positions(seg_text, lang=locale.langcode):
                if w_start > done:
                    # Filler between the previous word and this one.
                    result.append((w_start - done, seg_fmt))
                done = w_start + w_len
                word = seg_text[w_start:done]
                word_fmt = seg_fmt
                if not dictionaries.recognized(word, locale):
                    word_fmt = SyntaxTextCharFormat(spelling_fmt)
                    word_fmt.setProperty(SPELL_PROPERTY, (word, locale))
                result.append((w_len, word_fmt))
            if done < seg_len:
                # Filler after the last word of the run.
                result.append((seg_len - done, seg_fmt))
        else:
            result.append((seg_len, seg_fmt))
        seg_start += seg_len

    return result

コード例 #7

0

ファイルを表示

 def check_spelling(text, tlen, fmt, locale, sfmt, store_locale):
     '''
     Split ``text`` into ``(length, format)`` runs, marking misspelled words.

     Recognized words and the text between words get ``fmt``. A word that
     ``dictionaries.recognized`` rejects gets ``sfmt``, or, when
     ``store_locale`` is true, a copy of ``sfmt`` with SPELL_LOCALE_PROPERTY
     set to ``locale``. ``tlen`` is used as the end offset for the trailing
     run after the last word.
     '''
     runs = []
     is_known = dictionaries.recognized
     consumed = 0
     for word_start, word_len in split_into_words_and_positions(text, lang=locale.langcode):
         if word_start > consumed:
             # Text between the previous word and this one.
             runs.append((word_start - consumed, fmt))
         consumed = word_start + word_len
         if is_known(text[word_start:consumed], locale):
             runs.append((word_len, fmt))
         elif store_locale:
             tagged = QTextCharFormat(sfmt)
             tagged.setProperty(SPELL_LOCALE_PROPERTY, locale)
             runs.append((word_len, tagged))
         else:
             runs.append((word_len, sfmt))
     if consumed < tlen:
         # Text after the last word.
         runs.append((tlen - consumed, fmt))
     return runs

コード例 #8

0

ファイルを表示

ファイル: wordcount.py プロジェクト: JimmXinu/FanFicFare

def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    Count the individual words (rather than pages) in a book.

    The text comes from ``_read_epub_contents(iterator, strip_html=True)``.
    Three counting strategies are tried in order, falling through to the
    next one whenever the current one raises:

    1. ``calibre.spell.break_iterator.count_words``
    2. the number of entries returned by
       ``calibre.spell.break_iterator.split_into_words_and_positions``
    3. ``calibre.utils.wordcount.get_wordcount_obj(...).words``

    :param iterator: passed through unchanged to ``_read_epub_contents``.
    :param lang: language code handed to the break-iterator based counters.
    :return: the word count produced by whichever strategy succeeded.
    :raises: whatever the last-resort strategy raises if it fails as well.
    '''

    book_text = _read_epub_contents(iterator, strip_html=True)

    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s'%wordcount)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate instead of triggering the fallback.
        try: # The above method is new and no-one will have it as of 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug('\tWord count - split_into_words_and_positions method:%s'%wordcount)
        except Exception:
            # Last resort: the legacy word counter.
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s'%wordcount)

    return wordcount