def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by underscores.

    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'
    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'
    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'
    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'
    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'
    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'
    >>> preprocess_and_tokenize_text('de', ' u\N{COMBINING DIAERESIS}ber\\n')
    'über'
    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'
    >>> preprocess_and_tokenize_text('en', '_')
    ''
    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)
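# The doctests above double as a quick self-check. A minimal sketch of
# running them directly, assuming preprocess_text and simple_tokenize are
# importable in this module as written:
if __name__ == '__main__':
    import doctest

    # Report any docstring example whose actual output doesn't match.
    doctest.testmod()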
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr')
        == ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr')
        == ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
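# This test can be selected on its own under pytest; a minimal sketch,
# assuming pytest is installed (equivalent to running
# `pytest -k test_transliteration` from the command line):
if __name__ == '__main__':
    import pytest

    # Run only the tests whose names match the keyword expression.
    pytest.main(["-k", "test_transliteration"])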
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language,
    including removing English stopwords, normalizing the text in a way
    appropriate to that language (using the text normalization from
    wordfreq), and joining its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for example,
    they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)

    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
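# concept_uri is defined elsewhere in this repo. A hypothetical stand-in,
# inferred purely from the doctest outputs above rather than from the actual
# implementation, to show the shape of URI being produced:
def concept_uri_sketch(lang, text, *more_text):
    # Join the language code and the normalized pieces under the /c/
    # namespace, one path segment per piece.
    return '/'.join(['/c', lang, text] + list(more_text))

# concept_uri_sketch('en', 'this_is_test', 'n', 'example_phrase')
# -> '/c/en/this_is_test/n/example_phrase'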
def freq(word):
    # Return the frequency of a Finnish word, but report 0 for anything
    # that isn't already in normalized form, is a multi-digit string, or
    # has a leading or trailing hyphen (an affix fragment, not a word).
    if (
        word != preprocess_text(word, "fi")
        or MULTI_DIGIT_RE.fullmatch(word)
        or word.startswith("-")
        or word.endswith("-")
    ):
        return 0
    return wordfreq.word_frequency(word, "fi")
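# freq depends on names not defined in this excerpt; a minimal sketch of
# plausible definitions. The regex pattern is an assumption (it is meant to
# reject tokens of two or more digits), and the preprocess_text import path
# is likewise assumed:
import re

import wordfreq
# from wordfreq.preprocess import preprocess_text  # assumed import path

# Assumption: matches tokens made of two or more digits, so e.g. "1234" is
# filtered out while ordinary words pass through.
MULTI_DIGIT_RE = re.compile(r"[0-9][0-9]+")

# Expected behavior under these assumptions: a common Finnish word gets a
# nonzero frequency, while filtered inputs get 0.
# freq("kissa")   -> > 0  ('cat')
# freq("-lainen") -> 0    (leading hyphen)
# freq("1234")    -> 0    (multi-digit string)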