Example #1
def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by
    underscores.
    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'
    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'
    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'
    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'
    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'
    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'
    >>> preprocess_and_tokenize_text('de', '   u\N{COMBINING DIAERESIS}ber\\n')
    'über'
    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'
    >>> preprocess_and_tokenize_text('en', '_')
    ''
    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)


def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
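As a rough illustration of what the helpers above do for Latin-script input, the doctests can be approximated with the standard library alone: NFC-compose the text (so the combining diaeresis becomes 'ü'), casefold it, and join runs of word characters with underscores. This is an assumed simplification, not the real preprocess_text/simple_tokenize code, and it skips the language-specific steps such as the Serbian and Azerbaijani transliteration exercised in test_transliteration above.

import re
import unicodedata

def approx_preprocess_and_tokenize(text):
    # Treat underscores as spaces, compose to NFC, casefold, then join
    # runs of word characters with underscores.
    text = unicodedata.normalize('NFC', text.replace('_', ' ')).casefold()
    return '_'.join(re.findall(r'\w+', text))

print(approx_preprocess_and_tokenize('Italian supercat'))                # italian_supercat
print(approx_preprocess_and_tokenize('test/test'))                       # test_test
print(approx_preprocess_and_tokenize(' u\N{COMBINING DIAERESIS}ber\n'))  # über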
Example #3
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    removing English stopwords, normalizing the text in a way appropriate
    to that language (using the text normalization from wordfreq), and joining
    its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for example,
    they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)
    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
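standardized_concept_uri relies on two helpers that are not shown on this page, english_filter and concept_uri. The sketch below is an assumed simplification, not the real ConceptNet implementation: the stopword set is a made-up minimal list, english_filter drops stopwords unless that would leave nothing, and concept_uri simply joins the pieces under /c/ without any validation.

# Illustrative stand-ins for the helpers used above; assumed, simplified logic.
STOPWORDS = {'a', 'an', 'the'}  # assumed minimal stopword list

def english_filter(tokens):
    # Drop stopwords, but keep the original tokens if nothing would remain.
    filtered = [tok for tok in tokens if tok not in STOPWORDS]
    return filtered if filtered else tokens

def concept_uri(lang, text, *more):
    # Join the language and normalized text (plus any sense info) under /c/.
    return '/'.join(['/c', lang, text] + list(more))

print(concept_uri('en', '_'.join(english_filter(['this', 'is', 'a', 'test']))))
# /c/en/this_is_test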
Example #5
def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by
    underscores.

    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'

    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'

    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'

    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'

    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'

    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'

    >>> preprocess_and_tokenize_text('de', '   u\N{COMBINING DIAERESIS}ber\\n')
    'über'

    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'

    >>> preprocess_and_tokenize_text('en', '_')
    ''

    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)


def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (tokenize("Па, има ту много ствари које не схваташ.", 'sr') == [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ])
    assert (tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') == [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ])

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
Example #7
def freq(word):
    # Return 0 for words that preprocessing would alter, for multi-digit
    # strings, and for fragments that start or end with a hyphen; otherwise
    # return the word's frequency in Finnish.
    if (word != preprocess_text(word, "fi") or MULTI_DIGIT_RE.fullmatch(word)
            or word.startswith("-") or word.endswith("-")):
        return 0
    return wordfreq.word_frequency(word, "fi")
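A self-contained version of this filter can be sketched under two assumptions: MULTI_DIGIT_RE is taken to mean "two or more digits", and word.casefold() stands in for the preprocess_text call, which is not defined in this snippet. Only wordfreq.word_frequency is the real library call.

import re
import wordfreq

MULTI_DIGIT_RE = re.compile(r'[0-9]{2,}')  # assumed definition: two or more digits

def freq_fi(word):
    # Reject words that case folding would change, multi-digit strings, and
    # fragments that start or end with a hyphen; otherwise look up the
    # word's frequency in Finnish.
    if (word != word.casefold() or MULTI_DIGIT_RE.fullmatch(word)
            or word.startswith('-') or word.endswith('-')):
        return 0
    return wordfreq.word_frequency(word, 'fi')

print(freq_fi('kissa'))   # a small positive frequency
print(freq_fi('Kissa'))   # 0: altered by case folding
print(freq_fi('42'))      # 0: multi-digit string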