Python basic_tokenizeの例、enchant.tokenize.basic_tokenize Pythonの例

コード例 #1

0

ファイルを表示

ファイル: conf.py プロジェクト: ouh-churchill/diakonia

def process_django_models(app, what, name, obj, options, lines):
    """Extend a Django model's docstring lines with per-field documentation.

    For every field in ``obj._meta.fields`` a ``:param`` line (attribute
    name, quoted verbose name and, when present, the tag-stripped help
    text) and a ``:type`` line are appended to ``lines``.  When the
    module-level ``enchant`` name is not ``None``, the words making up each
    verbose name are also collected under a ``.. spelling::`` directive and
    appended after all fields.

    Objects that are not ``models.Model`` subclasses leave ``lines``
    untouched.  ``app``, ``what``, ``name`` and ``options`` are accepted but
    not used here.
    NOTE(review): the signature looks like a Sphinx
    ``autodoc-process-docstring`` handler -- confirm where it is connected.

    :returns: the same ``lines`` object, extended in place.
    """
    from django.utils.html import strip_tags
    from django.db import models

    whitelist = ['', '.. spelling::']

    if not (inspect.isclass(obj) and issubclass(obj, models.Model)):
        return lines

    for field in obj._meta.fields:
        description = strip_tags(ugettext(field.help_text))
        label = ugettext(field.verbose_name)

        if not description:
            lines.append(':param %s: "%s"' % (field.attname, label))
        else:
            lines.append(':param %s: "%s" - %s' % (field.attname, label, description))

        if enchant is not None:
            from enchant.tokenize import basic_tokenize

            # Treat '-', '_' and '.' all as separators and skip empty pieces.
            for part in label.replace('-', '.').replace('_', '.').split('.'):
                if part == '':
                    continue
                digitless = ''.join(ch for ch in part if not ch.isdigit())
                whitelist.append("    %s" % digitless)
                whitelist.extend(["    %s" % tok[0] for tok in basic_tokenize(part)])

        field_cls = type(field)
        # Report Django's own field classes under the public package path
        # rather than the private submodule they are defined in.
        if 'django.db.models' in field_cls.__module__:
            module_path = 'django.db.models'
        else:
            module_path = field_cls.__module__
        lines.append(':type %s: %s.%s' % (field.attname, module_path, field_cls.__name__))

    if enchant is not None:
        lines += whitelist
    return lines

コード例 #2

0

ファイルを表示

def test_tokenize_strip():
    """Test special-char-stripping edge-cases in basic_tokenize.

    The sample text wraps words in brackets and quotes; the expected list
    pairs each surviving token with its character offset in the text.
    """
    text = "((' <this> \"\" 'text' has (lots) of (special chars} >>]"
    expected = [
        ("<this>", 4),
        ("text", 15),
        ("has", 21),
        ("lots", 26),
        ("of", 32),
        ("special", 36),
        ("chars}", 44),
        (">>", 51),
    ]
    assert expected == list(basic_tokenize(text))
    for want, got in zip(expected, basic_tokenize(text)):
        # Must be ``==``: ``assert want, got`` would only check that ``want``
        # is truthy and use ``got`` as the failure message, so it never fails.
        assert want == got

コード例 #3

0

ファイルを表示

def test_basic_tokenize():
    """Regression check for plain white-space tokenization.

    Feeds a three-line paragraph to ``basic_tokenize`` and compares the
    resulting ``(word, offset)`` pairs against a fixed expectation, first as
    a whole list and then pair by pair.
    """
    paragraph = """This is a paragraph.  It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the (handling) of 'quoted' words."""
    expected = [
        # first line of the paragraph
        ("This", 0), ("is", 5), ("a", 8), ("paragraph", 10),
        ("It's", 22), ("not", 27), ("very", 31), ("special", 36),
        ("but", 45), ("it's", 49), ("designed", 54),
        # second line
        ("2", 63), ("show", 65), ("how", 70), ("the", 74),
        ("splitter", 78), ("works", 87), ("with", 93),
        ("many-different", 98), ("combos", 113),
        # third line
        ("of", 120), ("words", 123), ("Also", 130), ("need", 135),
        ("to", 140), ("test", 144), ("the", 150), ("handling", 155),
        ("of", 165), ("quoted", 169), ("words", 177),
    ]
    assert expected == list(basic_tokenize(paragraph))
    for want, got in zip(expected, basic_tokenize(paragraph)):
        assert want == got

コード例 #4

0

ファイルを表示

ファイル: interactive_spellcheck.py プロジェクト: Mustafabangor/sklearn-svm

 def process_text(self, text):
     """
     Tokenize ``text`` and return the words left after filtering.

     accepts: [String] text input
     returns: [List] tokens from an "en_US" tokenizer using ``URLFilter``,
              built from the words of ``text`` that contain neither '@'
              nor 'RT'; ``None`` if a UnicodeEncodeError is raised

     Side effect: ``self.result`` is emptied in place before tokenizing.
     Note: no case conversion is applied to the tokens here.
     """
     try:
         del self.result[:]
         kept = [
             token
             for (token, _offset) in basic_tokenize(text)
             if not ('@' in token or 'RT' in token)
         ]
         tokenizer = get_tokenizer("en_US", filters=[URLFilter])
         rejoined = ' '.join(kept)
         return [token for (token, _offset) in tokenizer(rejoined)]
     except UnicodeEncodeError:
         # Best-effort: encoding failures yield no result instead of raising.
         return None

コード例 #5

0

ファイルを表示

ファイル: conf.py プロジェクト: ouh-churchill/diakonia

def process_modules(app, what, name, obj, options, lines):
    """Whitelist the words of a module's dotted name for the spell checker.

    Only acts when ``what`` is ``'module'``.  ``name`` is split on ``.``,
    ``-`` and ``_``; for every non-empty piece the digit-stripped piece and
    each token reported by ``basic_tokenize`` are listed under a
    ``.. spelling::`` directive, which is appended to ``lines``.

    ``app``, ``obj`` and ``options`` are accepted but not used here.

    :returns: the same ``lines`` object (extended in place for modules).
    """
    if what == 'module':
        from enchant.tokenize import basic_tokenize

        directive = ['', '.. spelling::']
        for part in name.replace('-', '.').replace('_', '.').split('.'):
            if part == '':
                continue
            digitless = ''.join(ch for ch in part if not ch.isdigit())
            directive.append("    %s" % digitless)
            directive.extend(["    %s" % tok[0] for tok in basic_tokenize(part)])
        lines += directive
    return lines

コード例 #6

0

ファイルを表示

ファイル: conf.py プロジェクト: ouhft/COPE

def process_django_models(app, what, name, obj, options, lines):
    """Append params from fields to model documentation.

    Calls ``activate('en-gb')`` first, then, if ``obj`` is a
    ``models.Model`` subclass, appends for each field in
    ``obj._meta.fields``:

    * a ``:param`` line with the attribute name, the quoted verbose name
      and, when non-empty, the tag-stripped help text;
    * a ``:type`` line with the field class's module and name.

    When the module-level ``enchant`` name is not ``None``, the words of
    every verbose name are also gathered under a ``.. spelling::``
    directive that is appended after all fields.

    ``app``, ``what``, ``name`` and ``options`` are accepted but not used.
    NOTE(review): ``activate`` is presumably Django's translation
    ``activate`` -- confirm against the file's imports.

    :returns: the same ``lines`` object, extended in place.
    """
    from django.utils.html import strip_tags
    from django.db import models

    spelling_white_list = ['', '.. spelling::']
    activate('en-gb')

    if inspect.isclass(obj) and issubclass(obj, models.Model):
        for field in obj._meta.fields:
            help_text = strip_tags(ugettext(field.help_text))
            verbose_name = ugettext(field.verbose_name)

            if help_text:
                lines.append(':param {0}: "{1}" - {2}'.format(
                    field.attname, verbose_name, help_text))
            else:
                lines.append(':param {0}: "{1}"'.format(
                    field.attname, verbose_name))

            if enchant is not None:
                from enchant.tokenize import basic_tokenize

                # Treat '-', '_' and '.' as separators; drop empty pieces.
                words = verbose_name.replace('-', '.').replace('_',
                                                               '.').split('.')
                words = [s for s in words if s != '']
                for word in words:
                    # Whitelist the word with digits removed ...
                    spelling_white_list += [
                        "    %s" % ''.join(i for i in word if not i.isdigit())
                    ]
                    # ... and every token the tokenizer finds in it.
                    spelling_white_list += [
                        "    %s" % w[0] for w in basic_tokenize(word)
                    ]

            field_type = type(field)
            module = field_type.__module__
            if 'django.db.models' in module:
                # scope with django.db.models * imports
                module = 'django.db.models'
            lines.append(':type %s: %s.%s' %
                         (field.attname, module, field_type.__name__))
        if enchant is not None:
            lines += spelling_white_list
    return lines

コード例 #7

0

ファイルを表示

ファイル: conf.py プロジェクト: OskarPersson/ESSArch_Tools_Producer

def process_modules(app, what, name, obj, options, lines):
    """Append a spelling whitelist built from a module's name to ``lines``.

    Does nothing unless ``what`` equals ``'module'``.  Otherwise ``name`` is
    broken up at ``.``, ``-`` and ``_`` and, for each non-empty fragment,
    two kinds of entries go under a ``.. spelling::`` directive: the
    fragment without its digits, and the tokens ``basic_tokenize`` yields
    for it.

    ``app``, ``obj`` and ``options`` are accepted but not used here.

    :returns: ``lines`` itself, extended in place when a module is handled.
    """
    if what != 'module':
        return lines
    from enchant.tokenize import basic_tokenize

    normalised = name.replace('-', '.').replace('_', '.')
    fragments = [piece for piece in normalised.split('.') if piece != '']

    entries = ['', '.. spelling::']
    for fragment in fragments:
        without_digits = ''.join(c for c in fragment if not c.isdigit())
        entries += ["    %s" % without_digits]
        entries += ["    %s" % token[0] for token in basic_tokenize(fragment)]

    lines += entries
    return lines

コード例 #8

0

ファイルを表示

ファイル: conf.py プロジェクト: OskarPersson/ESSArch_Tools_Producer

def process_django_models(app, what, name, obj, options, lines):
    """Append params from fields to model documentation.

    If ``obj`` is a ``models.Model`` subclass, appends for each field in
    ``obj._meta.fields``:

    * a ``:param`` line with the attribute name, the capitalized verbose
      name and, when non-empty, the tag-stripped help text;
    * a ``:type`` line with the field class's module and name.

    When the module-level ``enchant`` name is not ``None``, the words of
    every verbose name are also gathered under a ``.. spelling::``
    directive that is appended after all fields.

    ``app``, ``what``, ``name`` and ``options`` are accepted but not used.

    :returns: the same ``lines`` object, extended in place.
    """
    try:
        from django.utils.encoding import force_text
    except ImportError:
        # ``force_text`` was removed in Django 4.0; ``force_str`` is its
        # replacement.  Only fall back when the old name is unavailable so
        # behaviour on older Django versions is unchanged.
        from django.utils.encoding import force_str as force_text
    from django.utils.html import strip_tags
    from django.db import models

    spelling_white_list = ['', '.. spelling::']

    if inspect.isclass(obj) and issubclass(obj, models.Model):
        for field in obj._meta.fields:
            help_text = strip_tags(force_text(field.help_text))
            verbose_name = force_text(field.verbose_name).capitalize()

            if help_text:
                lines.append(':param %s: %s - %s' %
                             (field.attname, verbose_name, help_text))
            else:
                lines.append(':param %s: %s' % (field.attname, verbose_name))

            if enchant is not None:
                from enchant.tokenize import basic_tokenize

                # Treat '-', '_' and '.' as separators; drop empty pieces.
                words = verbose_name.replace('-', '.').replace('_',
                                                               '.').split('.')
                words = [s for s in words if s != '']
                for word in words:
                    # Whitelist the word with digits removed ...
                    spelling_white_list += [
                        "    %s" % ''.join(i for i in word if not i.isdigit())
                    ]
                    # ... and every token the tokenizer finds in it.
                    spelling_white_list += [
                        "    %s" % w[0] for w in basic_tokenize(word)
                    ]

            field_type = type(field)
            module = field_type.__module__
            if 'django.db.models' in module:
                # scope with django.db.models * imports
                module = 'django.db.models'
            lines.append(':type %s: %s.%s' %
                         (field.attname, module, field_type.__name__))
        if enchant is not None:
            lines += spelling_white_list
    return lines